diff --git a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp index b9659b4c01..bbcc2d170f 100644 --- a/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp +++ b/plugins/zzogl-pg/opengl/Linux/zzogl-pg/zzogl-pg.cbp @@ -78,6 +78,9 @@ + + + diff --git a/plugins/zzogl-pg/opengl/Makefile.am b/plugins/zzogl-pg/opengl/Makefile.am index 679a611d0b..c055ca4c82 100644 --- a/plugins/zzogl-pg/opengl/Makefile.am +++ b/plugins/zzogl-pg/opengl/Makefile.am @@ -26,7 +26,9 @@ libzzoglpg_LDADD=$(libzzoglpg_a_OBJECTS) libzzoglpg_a_SOURCES = \ GSmain.cpp GifTransfer.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp Mem.cpp \ rasterfont.cpp targets.cpp zerogs.cpp ZZoglVB.cpp ZZoglShoots.cpp ZZoglCreate.cpp \ -ZZoglShaders.cpp ZZoglCRTC.cpp ZZoglSave.cpp ZZoglFlush.cpp +ZZoglShaders.cpp ZZoglCRTC.cpp ZZoglSave.cpp ZZoglFlush.cpp \ +Mem_Swizzle.h Mem_Tables.cpp Mem_Transmit.h + libzzoglpg_a_SOURCES += x86-32.S diff --git a/plugins/zzogl-pg/opengl/Mem.cpp b/plugins/zzogl-pg/opengl/Mem.cpp index 8d82f6ebca..3d67880f85 100644 --- a/plugins/zzogl-pg/opengl/Mem.cpp +++ b/plugins/zzogl-pg/opengl/Mem.cpp @@ -22,481 +22,12 @@ #include "targets.h" #include "x86.h" -u32 g_blockTable32[4][8] = { - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -u32 g_blockTable32Z[4][8] = { - { 24, 25, 28, 29, 8, 9, 12, 13}, - { 26, 27, 30, 31, 10, 11, 14, 15}, - { 16, 17, 20, 21, 0, 1, 4, 5}, - { 18, 19, 22, 23, 2, 3, 6, 7} -}; - -u32 g_blockTable16[8][4] = { - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -u32 g_blockTable16S[8][4] = { - { 0, 2, 16, 18 }, - { 1, 3, 17, 19 }, - { 8, 10, 24, 26 }, - { 9, 11, 25, 27 }, - { 4, 6, 20, 22 }, - { 5, 7, 21, 23 }, - { 12, 14, 28, 30 }, - { 13, 15, 29, 31 } -}; - -u32 g_blockTable16Z[8][4] = { - { 24, 26, 16, 18 }, - { 25, 27, 17, 19 }, - { 28, 30, 20, 22 }, - { 29, 31, 21, 23 }, - { 8, 10, 0, 2 }, - { 9, 11, 1, 3 }, - { 12, 14, 4, 6 }, - { 13, 15, 5, 7 } -}; - -u32 g_blockTable16SZ[8][4] = { - { 24, 26, 8, 10 }, - { 25, 27, 9, 11 }, - { 16, 18, 0, 2 }, - { 17, 19, 1, 3 }, - { 28, 30, 12, 14 }, - { 29, 31, 13, 15 }, - { 20, 22, 4, 6 }, - { 21, 23, 5, 7 } -}; - -u32 g_blockTable8[4][8] = { - { 0, 1, 4, 5, 16, 17, 20, 21}, - { 2, 3, 6, 7, 18, 19, 22, 23}, - { 8, 9, 12, 13, 24, 25, 28, 29}, - { 10, 11, 14, 15, 26, 27, 30, 31} -}; - -u32 g_blockTable4[8][4] = { - { 0, 2, 8, 10 }, - { 1, 3, 9, 11 }, - { 4, 6, 12, 14 }, - { 5, 7, 13, 15 }, - { 16, 18, 24, 26 }, - { 17, 19, 25, 27 }, - { 20, 22, 28, 30 }, - { 21, 23, 29, 31 } -}; - -u32 g_columnTable32[8][8] = { - { 0, 1, 4, 5, 8, 9, 12, 13 }, - { 2, 3, 6, 7, 10, 11, 14, 15 }, - { 16, 17, 20, 21, 24, 25, 28, 29 }, - { 18, 19, 22, 23, 26, 27, 30, 31 }, - { 32, 33, 36, 37, 40, 41, 44, 45 }, - { 34, 35, 38, 39, 42, 43, 46, 47 }, - { 48, 49, 52, 53, 56, 57, 60, 61 }, - { 50, 51, 54, 55, 58, 59, 62, 63 }, -}; - -u32 g_columnTable16[8][16] = { - { 0, 2, 8, 10, 16, 18, 24, 26, - 1, 3, 9, 11, 17, 19, 25, 27 }, - { 4, 6, 12, 14, 20, 22, 28, 30, - 5, 7, 13, 15, 21, 23, 29, 31 }, - { 32, 34, 40, 42, 48, 50, 56, 58, - 33, 35, 41, 43, 49, 51, 57, 59 }, - { 36, 38, 44, 46, 52, 54, 60, 62, - 37, 39, 45, 47, 53, 55, 61, 63 }, - { 64, 66, 72, 74, 80, 82, 88, 90, - 65, 67, 73, 75, 81, 83, 89, 91 }, - { 68, 70, 76, 78, 84, 86, 92, 94, - 69, 71, 77, 79, 85, 87, 93, 95 }, - { 96, 98, 104, 106, 112, 114, 120, 
122, - 97, 99, 105, 107, 113, 115, 121, 123 }, - { 100, 102, 108, 110, 116, 118, 124, 126, - 101, 103, 109, 111, 117, 119, 125, 127 }, -}; - -u32 g_columnTable8[16][16] = { - { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 - 2, 6, 18, 22, 34, 38, 50, 54 }, - { 8, 12, 24, 28, 40, 44, 56, 60, - 10, 14, 26, 30, 42, 46, 58, 62 }, - { 33, 37, 49, 53, 1, 5, 17, 21, - 35, 39, 51, 55, 3, 7, 19, 23 }, - { 41, 45, 57, 61, 9, 13, 25, 29, - 43, 47, 59, 63, 11, 15, 27, 31 }, - { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 - 98, 102, 114, 118, 66, 70, 82, 86 }, - { 104, 108, 120, 124, 72, 76, 88, 92, - 106, 110, 122, 126, 74, 78, 90, 94 }, - { 65, 69, 81, 85, 97, 101, 113, 117, - 67, 71, 83, 87, 99, 103, 115, 119 }, - { 73, 77, 89, 93, 105, 109, 121, 125, - 75, 79, 91, 95, 107, 111, 123, 127 }, - { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 - 130, 134, 146, 150, 162, 166, 178, 182 }, - { 136, 140, 152, 156, 168, 172, 184, 188, - 138, 142, 154, 158, 170, 174, 186, 190 }, - { 161, 165, 177, 181, 129, 133, 145, 149, - 163, 167, 179, 183, 131, 135, 147, 151 }, - { 169, 173, 185, 189, 137, 141, 153, 157, - 171, 175, 187, 191, 139, 143, 155, 159 }, - { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 - 226, 230, 242, 246, 194, 198, 210, 214 }, - { 232, 236, 248, 252, 200, 204, 216, 220, - 234, 238, 250, 254, 202, 206, 218, 222 }, - { 193, 197, 209, 213, 225, 229, 241, 245, - 195, 199, 211, 215, 227, 231, 243, 247 }, - { 201, 205, 217, 221, 233, 237, 249, 253, - 203, 207, 219, 223, 235, 239, 251, 255 }, -}; - -u32 g_columnTable4[16][32] = { - { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 - 2, 10, 34, 42, 66, 74, 98, 106, - 4, 12, 36, 44, 68, 76, 100, 108, - 6, 14, 38, 46, 70, 78, 102, 110 }, - { 16, 24, 48, 56, 80, 88, 112, 120, - 18, 26, 50, 58, 82, 90, 114, 122, - 20, 28, 52, 60, 84, 92, 116, 124, - 22, 30, 54, 62, 86, 94, 118, 126 }, - { 65, 73, 97, 105, 1, 9, 33, 41, - 67, 75, 99, 107, 3, 11, 35, 43, - 69, 77, 101, 109, 5, 13, 37, 45, - 71, 79, 103, 111, 7, 15, 39, 47 }, - { 81, 89, 113, 121, 17, 25, 49, 57, - 83, 91, 115, 123, 19, 27, 51, 59, - 85, 93, 117, 125, 21, 29, 53, 61, - 87, 95, 119, 127, 23, 31, 55, 63 }, - { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 - 194, 202, 226, 234, 130, 138, 162, 170, - 196, 204, 228, 236, 132, 140, 164, 172, - 198, 206, 230, 238, 134, 142, 166, 174 }, - { 208, 216, 240, 248, 144, 152, 176, 184, - 210, 218, 242, 250, 146, 154, 178, 186, - 212, 220, 244, 252, 148, 156, 180, 188, - 214, 222, 246, 254, 150, 158, 182, 190 }, - { 129, 137, 161, 169, 193, 201, 225, 233, - 131, 139, 163, 171, 195, 203, 227, 235, - 133, 141, 165, 173, 197, 205, 229, 237, - 135, 143, 167, 175, 199, 207, 231, 239 }, - { 145, 153, 177, 185, 209, 217, 241, 249, - 147, 155, 179, 187, 211, 219, 243, 251, - 149, 157, 181, 189, 213, 221, 245, 253, - 151, 159, 183, 191, 215, 223, 247, 255 }, - { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 - 258, 266, 290, 298, 322, 330, 354, 362, - 260, 268, 292, 300, 324, 332, 356, 364, - 262, 270, 294, 302, 326, 334, 358, 366 }, - { 272, 280, 304, 312, 336, 344, 368, 376, - 274, 282, 306, 314, 338, 346, 370, 378, - 276, 284, 308, 316, 340, 348, 372, 380, - 278, 286, 310, 318, 342, 350, 374, 382 }, - { 321, 329, 353, 361, 257, 265, 289, 297, - 323, 331, 355, 363, 259, 267, 291, 299, - 325, 333, 357, 365, 261, 269, 293, 301, - 327, 335, 359, 367, 263, 271, 295, 303 }, - { 337, 345, 369, 377, 273, 281, 305, 313, - 339, 347, 371, 379, 275, 283, 307, 315, - 341, 349, 373, 381, 277, 285, 309, 317, - 343, 351, 375, 383, 279, 287, 311, 319 }, - { 
448, 456, 480, 488, 384, 392, 416, 424, // column 3 - 450, 458, 482, 490, 386, 394, 418, 426, - 452, 460, 484, 492, 388, 396, 420, 428, - 454, 462, 486, 494, 390, 398, 422, 430 }, - { 464, 472, 496, 504, 400, 408, 432, 440, - 466, 474, 498, 506, 402, 410, 434, 442, - 468, 476, 500, 508, 404, 412, 436, 444, - 470, 478, 502, 510, 406, 414, 438, 446 }, - { 385, 393, 417, 425, 449, 457, 481, 489, - 387, 395, 419, 427, 451, 459, 483, 491, - 389, 397, 421, 429, 453, 461, 485, 493, - 391, 399, 423, 431, 455, 463, 487, 495 }, - { 401, 409, 433, 441, 465, 473, 497, 505, - 403, 411, 435, 443, 467, 475, 499, 507, - 405, 413, 437, 445, 469, 477, 501, 509, - 407, 415, 439, 447, 471, 479, 503, 511 }, -}; - -u32 g_pageTable32[32][64]; -u32 g_pageTable32Z[32][64]; -u32 g_pageTable16[64][64]; -u32 g_pageTable16S[64][64]; -u32 g_pageTable16Z[64][64]; -u32 g_pageTable16SZ[64][64]; -u32 g_pageTable8[64][128]; -u32 g_pageTable4[128][128]; +#include "Mem_Transmit.h" +#include "Mem_Swizzle.h" BLOCK m_Blocks[0x40]; // do so blocks are indexable -static PCSX2_ALIGNED16(u32 tempblock[64]); -#define DSTPSM gs.dstbuf.psm - -#define START_HOSTLOCAL() \ - assert( gs.imageTransfer == 0 ); \ - u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; \ - \ - /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ \ - int i = gs.imageY, j = gs.imageX; \ - -#define END_HOSTLOCAL() \ -End: \ - if( i >= gs.imageEndY ) { \ - assert( gs.imageTransfer == -1 || i == gs.imageEndY ); \ - gs.imageTransfer = -1; \ - /*int start, end; \ - ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); \ - ZeroGS::g_MemTargs.ClearRange(start, end);*/ \ - } \ - else { \ - /* update new params */ \ - gs.imageY = i; \ - gs.imageX = j; \ - } \ - -// transfers whole rows -#define TRANSMIT_HOSTLOCAL_Y_(psm, T, widthlimit, endY) { \ - assert( (nSize%widthlimit) == 0 && widthlimit <= 4 ); \ - if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { \ - /*GS_LOG("Bad Transmission! 
%d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 1) { \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ - } \ - } \ - } \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += widthlimit) { \ - /* write as many pixel at one time as possible */ \ - if( nSize < widthlimit ) goto End; \ - writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ - \ - if( widthlimit > 1 ) { \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, pbuf[1], gs.dstbuf.bw); \ - \ - if( widthlimit > 2 ) { \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, pbuf[2], gs.dstbuf.bw); \ - \ - if( widthlimit > 3 ) { \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, pbuf[3], gs.dstbuf.bw); \ - } \ - } \ - } \ - } \ - \ - if( j >= gs.imageEndX ) { assert(j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || nSize*sizeof(T)/4 == 0 ); goto End; } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define TRANSMIT_HOSTLOCAL_X_(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j++, pbuf++) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); \ - } \ - pbuf += pitch-fracX; \ - } \ -} \ - -// transfers whole rows -#define TRANSMIT_HOSTLOCAL_Y_24(psm, T, widthlimit, endY) { \ - if( widthlimit != 8 || ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { \ - /*GS_LOG("Bad Transmission! %d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 3) { \ - writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf), gs.dstbuf.bw); \ - } \ - \ - if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || nSize == 0 ); goto End; } \ - } \ - } \ - else { \ - assert( /*(nSize%widthlimit) == 0 &&*/ widthlimit == 8 ); \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += 3*widthlimit) { \ - if( nSize < widthlimit ) goto End; \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf+0), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *(u32*)(pbuf+3), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *(u32*)(pbuf+6), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *(u32*)(pbuf+9), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *(u32*)(pbuf+12), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *(u32*)(pbuf+15), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *(u32*)(pbuf+18), gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *(u32*)(pbuf+21), gs.dstbuf.bw); \ - } \ - \ - if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ - else { \ - if( nSize < 0 ) { \ - /* extracted too much */ \ - assert( (nSize%3)==0 && nSize > -24 ); \ - j += nSize/3; \ - nSize = 0; \ - } \ - assert( gs.imageTransfer == -1 || nSize == 0 ); \ - goto End; \ - } \ - } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define 
TRANSMIT_HOSTLOCAL_X_24(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j++, pbuf += 3) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, *(u32*)pbuf, gs.dstbuf.bw); \ - } \ - pbuf += 3*(pitch-fracX); \ - } \ -} \ - -// meant for 4bit transfers -#define TRANSMIT_HOSTLOCAL_Y_4(psm, T, widthlimit, endY) { \ - for(; i < endY; ++i) { \ - for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit) { \ - /* write as many pixel at one time as possible */ \ - writePixel##psm##_0(pstart, j%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - if( widthlimit > 2 ) { \ - writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - \ - if( widthlimit > 4 ) { \ - writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - \ - if( widthlimit > 6 ) { \ - writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ - pbuf++; \ - } \ - } \ - } \ - } \ - \ - if( j >= gs.imageEndX ) { j = gs.trxpos.dx; } \ - else { assert( gs.imageTransfer == -1 || (nSize/32) == 0 ); goto End; } \ - } \ -} \ - -// transmit until endX, don't check size since it has already been prevalidated -#define TRANSMIT_HOSTLOCAL_X_4(psm, T, widthlimit, blockheight, startX) { \ - for(int tempi = 0; tempi < blockheight; ++tempi) { \ - for(j = startX; j < gs.imageEndX; j+=2, pbuf++) { \ - writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0]&0x0f, gs.dstbuf.bw); \ - writePixel##psm##_0(pstart, (j+1)%2048, (i+tempi)%2048, pbuf[0]>>4, gs.dstbuf.bw); \ - } \ - pbuf += (pitch-fracX)/2; \ - } \ -} \ - -// calculate pitch in source buffer -#define TRANSMIT_PITCH_(pitch, T) (pitch*sizeof(T)) -#define TRANSMIT_PITCH_24(pitch, T) (pitch*3) -#define TRANSMIT_PITCH_4(pitch, T) (pitch/2) - -// special swizzle macros -#define SwizzleBlock24(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch-24) { \ - for(int bx = 0; bx < 8; ++bx, pnewsrc += 3) { \ - pblock[bx] = *(u32*)pnewsrc; \ - } \ - } \ - for(int bx = 0; bx < 7; ++bx, pnewsrc += 3) { \ - /* might be 1 byte out of bounds of GS memory */ \ - pblock[bx] = *(u32*)pnewsrc; \ - } \ - /* do 3 bytes for the last copy */ \ - *((u8*)pblock+28) = pnewsrc[0]; \ - *((u8*)pblock+29) = pnewsrc[1]; \ - *((u8*)pblock+30) = pnewsrc[2]; \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff); \ -} \ - -#define SwizzleBlock24u SwizzleBlock24 - -#define SwizzleBlock8H(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = *(u32*)pnewsrc; \ - pblock[0] = u<<24; \ - pblock[1] = u<<16; \ - pblock[2] = u<<8; \ - pblock[3] = u; \ - u = *(u32*)(pnewsrc+4); \ - pblock[4] = u<<24; \ - pblock[5] = u<<16; \ - pblock[6] = u<<8; \ - pblock[7] = u; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000); \ -} \ - -#define SwizzleBlock8Hu SwizzleBlock8H - -#define SwizzleBlock4HH(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = 
*(u32*)pnewsrc; \ - pblock[0] = u<<28; \ - pblock[1] = u<<24; \ - pblock[2] = u<<20; \ - pblock[3] = u<<16; \ - pblock[4] = u<<12; \ - pblock[5] = u<<8; \ - pblock[6] = u<<4; \ - pblock[7] = u; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000); \ -} \ - -#define SwizzleBlock4HHu SwizzleBlock4HH - -#define SwizzleBlock4HL(dst, src, pitch) { \ - u8* pnewsrc = src; \ - u32* pblock = tempblock; \ - \ - for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) { \ - u32 u = *(u32*)pnewsrc; \ - pblock[0] = u<<24; \ - pblock[1] = u<<20; \ - pblock[2] = u<<16; \ - pblock[3] = u<<12; \ - pblock[4] = u<<8; \ - pblock[5] = u<<4; \ - pblock[6] = u; \ - pblock[7] = u>>4; \ - } \ - SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000); \ -} \ - -#define SwizzleBlock4HLu SwizzleBlock4HL +PCSX2_ALIGNED16(u32 tempblock[64]); // ------------------------ // | Y | @@ -510,11 +41,15 @@ End: \ #define DEFINE_TRANSFERLOCAL(psm, T, widthlimit, blockbits, blockwidth, blockheight, TransSfx, SwizzleBlock) \ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ { \ - START_HOSTLOCAL(); \ + assert( gs.imageTransfer == 0 ); \ + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; \ + \ + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ \ + int i = gs.imageY, j = gs.imageX; \ \ const T* pbuf = (const T*)pbyMem; \ - int nLeftOver = (nQWordSize*4*2)%(TRANSMIT_PITCH##TransSfx(2, T)); \ - int nSize = nQWordSize*4*2/TRANSMIT_PITCH##TransSfx(2, T); \ + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch##TransSfx(2)); \ + int nSize = nQWordSize*4*2/TransmitPitch##TransSfx(2); \ nSize = min(nSize, gs.imageWnew * gs.imageHnew); \ \ int pitch, area, fracX; \ @@ -544,10 +79,10 @@ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ \ if( ((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit) ) { \ /* transmit with a width of 1 */ \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, (1+(DSTPSM == 0x14)), endY); \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, (1+(DSTPSM == 0x14)), endY); \ } \ else { \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, widthlimit, endY); \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, widthlimit, endY); \ } \ \ if( nSize == 0 || i == gs.imageEndY ) \ @@ -561,44 +96,1710 @@ int TransferHostLocal##psm(const void* pbyMem, u32 nQWordSize) \ area = pitch*blockheight; \ fracX = gs.imageEndX-alignedX; \ \ - /* on top of checking whether pbuf is alinged, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ \ - bAligned = !((uptr)pbuf & 0xf) && (TRANSMIT_PITCH##TransSfx(pitch, T)&0xf) == 0; \ + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ \ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch##TransSfx(pitch) & 0xf) == 0; \ \ /* transfer aligning to blocks */ \ for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) { \ \ if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) { \ - for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH##TransSfx(blockwidth, T)/sizeof(T)) { \ - SwizzleBlock(pstart + getPixelAddress##psm##_0(tempj, i, gs.dstbuf.bw)*blockbits/8, \ - (u8*)pbuf, TRANSMIT_PITCH##TransSfx(pitch, T)); \ + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch##TransSfx(blockwidth)/sizeof(T)) { \ + SwizzleBlock(pstart + getPixelAddress_0(psm,tempj, i, gs.dstbuf.bw)*blockbits/8, \ + (u8*)pbuf, 
TransmitPitch##TransSfx(pitch)); \ } \ } \ else { \ - for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH##TransSfx(blockwidth, T)/sizeof(T)) { \ - SwizzleBlock##u(pstart + getPixelAddress##psm##_0(tempj, i, gs.dstbuf.bw)*blockbits/8, \ - (u8*)pbuf, TRANSMIT_PITCH##TransSfx(pitch, T)); \ + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch##TransSfx(blockwidth)/sizeof(T)) { \ + SwizzleBlock##u(pstart + getPixelAddress_0(psm,tempj, i, gs.dstbuf.bw)*blockbits/8, \ + (u8*)pbuf, TransmitPitch##TransSfx(pitch)); \ } \ } \ \ /* transfer the rest */ \ if( alignedX < gs.imageEndX ) { \ - TRANSMIT_HOSTLOCAL_X##TransSfx(psm, T, widthlimit, blockheight, alignedX); \ - pbuf -= TRANSMIT_PITCH##TransSfx((alignedX-gs.trxpos.dx), T)/sizeof(T); \ + TRANSMIT_HOSTLOCAL_X(TransSfx,psm, T, widthlimit, blockheight, alignedX); \ + pbuf -= TransmitPitch##TransSfx(alignedX-gs.trxpos.dx)/sizeof(T); \ } \ - else pbuf += (blockheight-1)*TRANSMIT_PITCH##TransSfx(pitch, T)/sizeof(T); \ + else pbuf += (blockheight-1)*TransmitPitch##TransSfx(pitch)/sizeof(T); \ j = gs.trxpos.dx; \ } \ \ - if( TRANSMIT_PITCH##TransSfx(nSize, T)/4 > 0 ) { \ - TRANSMIT_HOSTLOCAL_Y##TransSfx(psm, T, widthlimit, gs.imageEndY); \ + if( TransmitPitch##TransSfx(nSize)/4 > 0 ) { \ + TRANSMIT_HOSTLOCAL_Y(TransSfx,psm, T, widthlimit, gs.imageEndY); \ /* sometimes wrong sizes are sent (tekken tag) */ \ - assert( gs.imageTransfer == -1 || TRANSMIT_PITCH##TransSfx(nSize,T)/4 <= 2 ); \ + assert( gs.imageTransfer == -1 || TransmitPitch##TransSfx(nSize)/4 <= 2 ); \ } \ \ - END_HOSTLOCAL(); \ - return (nSize * TRANSMIT_PITCH##TransSfx(2, T) + nLeftOver)/2; \ +End: \ + if( i >= gs.imageEndY ) { \ + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); \ + gs.imageTransfer = -1; \ + /*int start, end; \ + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); \ + ZeroGS::g_MemTargs.ClearRange(start, end);*/ \ + } \ + else { \ + /* update new params */ \ + gs.imageY = i; \ + gs.imageX = j; \ + } \ + return (nSize * TransmitPitch##TransSfx(2) + nLeftOver)/2; \ } \ +//#define NEW_TRANSFER +#ifdef NEW_TRANSFER + +//DEFINE_TRANSFERLOCAL(32, u32, 2, 32, 8, 8, _, SwizzleBlock32); +int TransferHostLocal32(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 2; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u32); +// _SwizzleBlock swizzle; + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u32* pbuf = (const u32*)pbyMem; + const int tp2 = TransmitPitch_(2); + int nLeftOver = (nQWordSize*4*2)%tp2; + int nSize = (nQWordSize*4*2)/tp2; + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d 
%d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(32, u32, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(32, u32, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch) & 0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + + if ( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL))) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; + SwizzleBlock32(temp, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; + SwizzleBlock32u(temp, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + +// if ( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL))) +// { +// swizzle = SwizzleBlock32; +// } +// else +// { +// swizzle = SwizzleBlock32u; +// } +// +// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) +// { +// u8 *temp = pstart + getPixelAddress_0(32, tempj, i, gs.dstbuf.bw)*blockbits/8; +// swizzle(temp, (u8*)pbuf, TransmitPitch_(pitch), 0xffffffff); +// } + + /* transfer the rest */ + if( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(32, u32, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + + j = gs.trxpos.dx; + } + + if ( TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(32, u32, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + +End: + if( i >= gs.imageEndY ) + { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else + { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(32Z, u32, 2, 32, 8, 8, _, SwizzleBlock32); +int TransferHostLocal32Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 2; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = 
sizeof(u32); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u32* pbuf = (const u32*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(32Z, u32, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(32Z, u32, widthlimit, endY); + } + + if ( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u32)) + { + SwizzleBlock32(pstart + getPixelAddress_0(32Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u32)) + { + SwizzleBlock32u(pstart + getPixelAddress_0(32Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_( 32Z, u32, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX - gs.trxpos.dx)/sizeof(u32); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/sizeof(u32); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_( 32Z, u32, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, 
gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(24, u8, 8, 32, 8, 8, _24, SwizzleBlock24); +int TransferHostLocal24(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_24(2)); + int nSize = nQWordSize*4*2/TransmitPitch_24(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_24(24, T, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_24(24, T, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_24(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24(pstart + getPixelAddress_0(24,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24u(pstart + getPixelAddress_0(24,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_24(24, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_24((alignedX-gs.trxpos.dx))/sizeof(u8); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_24(pitch)/sizeof(u8); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_24(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_24(24, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_24(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_24(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(24Z, u8, 8, 32, 8, 8, _24, SwizzleBlock24); +int TransferHostLocal24Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_24(2)); + int nSize = nQWordSize*4*2/TransmitPitch_24(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = 
gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_24(16, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_24(16, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_24(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_24(blockwidth)/sizeof(u8)) + { + SwizzleBlock24u(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_24(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_24(16, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_24(alignedX-gs.trxpos.dx)/sizeof(u8); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_24(pitch)/sizeof(u8); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_24(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_24(24, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_24(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_24(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if 
((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u16)) + { + SwizzleBlock16(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/sizeof(u16)) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/sizeof(u16); + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/sizeof(u16); + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16S, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16S(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; 
+ int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16S, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16S, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16S,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16S,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16S, u16, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_((alignedX-gs.trxpos.dx))/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16S, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16Z, 
u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16Z(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16Z,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16Z, T, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16Z, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(16SZ, u16, 4, 16, 16, 8, _, SwizzleBlock16); +int TransferHostLocal16SZ(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 16; + const u32 blockwidth = 16; + const u32 blockheight = 8; + const u32 TSize = sizeof(u16); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u16* pbuf = (const u16*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole 
image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16(pstart + getPixelAddress_0(16SZ,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock16u(pstart + getPixelAddress_0(16SZ,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(16SZ, u16, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(16SZ, u16, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(8, u8, 4, 8, 16, 16, _, SwizzleBlock8); +int TransferHostLocal8(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 4; + const u32 blockbits = 8; + const u32 blockwidth = 16; + const u32 blockheight = 16; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = 
(int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(8, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(8, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8(pstart + getPixelAddress_0(8,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8u(pstart + getPixelAddress_0(8,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(8, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TRANSMIT_PITCH_(nSize, u8)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(8, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4, u8, 8, 4, 32, 16, _4, SwizzleBlock4); +int TransferHostLocal4(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 4; + const u32 blockwidth = 32; + const u32 blockheight = 16; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = 
min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4(pstart + getPixelAddress_0(4,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4u(pstart + getPixelAddress_0(4,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(8H, u8, 4, 32, 8, 8, _, SwizzleBlock8H); +int TransferHostLocal8H(const void* pbyMem, u32 nQWordSize) +{ + const u32 
widthlimit = 4; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_(2)); + int nSize = nQWordSize*4*2/TransmitPitch_(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_(8H, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_(8H, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! 
*/ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8H(pstart + getPixelAddress_0(8H,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_(blockwidth)/TSize) + { + SwizzleBlock8Hu(pstart + getPixelAddress_0(8H,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_(8H, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TRANSMIT_PITCH_(nSize, u8)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_(8H, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4HL, u8, 8, 32, 8, 8, _4, SwizzleBlock4HL); +int TransferHostLocal4HL(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image 
*/ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HL(pstart + getPixelAddress_0(4HL,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HLu(pstart + getPixelAddress_0(4HL,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4HL, u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4HL, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} + +//DEFINE_TRANSFERLOCAL(4HH, u8, 8, 32, 8, 8, _4, SwizzleBlock4HH); +int TransferHostLocal4HH(const void* pbyMem, u32 nQWordSize) +{ + const u32 widthlimit = 8; + const u32 blockbits = 32; + const u32 blockwidth = 8; + const u32 blockheight = 8; + const u32 TSize = sizeof(u8); + + assert( gs.imageTransfer == 0 ); + u8* pstart = g_pbyGSMemory + gs.dstbuf.bp*256; + + /*const u8* pendbuf = (const u8*)pbyMem + nQWordSize*4;*/ + int i = gs.imageY, j = gs.imageX; + + const u8* pbuf = (const u8*)pbyMem; + int nLeftOver = (nQWordSize*4*2)%(TransmitPitch_4(2)); + int nSize = nQWordSize*4*2/TransmitPitch_4(2); + nSize = min(nSize, gs.imageWnew * gs.imageHnew); + + int pitch, area, fracX; + int endY = ROUND_UPPOW2(i, blockheight); + int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); + int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); + bool bAligned, bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; + + if ((gs.imageEndX-gs.trxpos.dx)%widthlimit) + { + /* hack */ + int 
testwidth = (int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx); + if ((testwidth <= widthlimit) && (testwidth >= -widthlimit)) + { + /* don't transfer */ + /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ + gs.imageTransfer = -1; + } + bCanAlign = false; + } + + /* first align on block boundary */ + if ( MOD_POW2(i, blockheight) || !bCanAlign ) + { + + if ( !bCanAlign ) + endY = gs.imageEndY; /* transfer the whole image */ + else + assert( endY < gs.imageEndY); /* part of alignment condition */ + + if (((gs.imageEndX-gs.trxpos.dx)%widthlimit) || ((gs.imageEndX-j)%widthlimit)) + { + /* transmit with a width of 1 */ + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, (1+(DSTPSM == 0x14)), endY); + } + else + { + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, widthlimit, endY); + } + + if( nSize == 0 || i == gs.imageEndY ) goto End; + } + + assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); + + /* can align! */ + pitch = gs.imageEndX-gs.trxpos.dx; + area = pitch*blockheight; + fracX = gs.imageEndX-alignedX; + + /* on top of checking whether pbuf is aligned, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ + bAligned = !((uptr)pbuf & 0xf) && (TransmitPitch_4(pitch)&0xf) == 0; + + /* transfer aligning to blocks */ + for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) + { + if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HH(pstart + getPixelAddress_0(4HH,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + else + { + for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TransmitPitch_4(blockwidth)/TSize) + { + SwizzleBlock4HHu(pstart + getPixelAddress_0(4HH,tempj, i, gs.dstbuf.bw)*blockbits/8, (u8*)pbuf, TransmitPitch_4(pitch)); + } + } + + /* transfer the rest */ + if ( alignedX < gs.imageEndX ) + { + TRANSMIT_HOSTLOCAL_X_4(4HH,u8, widthlimit, blockheight, alignedX); + pbuf -= TransmitPitch_4(alignedX-gs.trxpos.dx)/TSize; + } + else + { + pbuf += (blockheight-1)*TransmitPitch_4(pitch)/TSize; + } + j = gs.trxpos.dx; + } + + if (TransmitPitch_4(nSize)/4 > 0 ) + { + TRANSMIT_HOSTLOCAL_Y_4(4HH, u8, widthlimit, gs.imageEndY); + /* sometimes wrong sizes are sent (tekken tag) */ + assert( gs.imageTransfer == -1 || TransmitPitch_4(nSize)/4 <= 2 ); + } + + End: + if( i >= gs.imageEndY ) { + assert( gs.imageTransfer == -1 || i == gs.imageEndY ); + gs.imageTransfer = -1; + /*int start, end; + ZeroGS::GetRectMemAddress(start, end, gs.dstbuf.psm, gs.trxpos.dx, gs.trxpos.dy, gs.imageWnew, gs.imageHnew, gs.dstbuf.bp, gs.dstbuf.bw); + ZeroGS::g_MemTargs.ClearRange(start, end);*/ + } + else { + /* update new params */ + gs.imageY = i; + gs.imageX = j; + } + return (nSize * TransmitPitch_4(2) + nLeftOver)/2; +} +#else + DEFINE_TRANSFERLOCAL(32, u32, 2, 32, 8, 8, _, SwizzleBlock32); DEFINE_TRANSFERLOCAL(32Z, u32, 2, 32, 8, 8, _, SwizzleBlock32); DEFINE_TRANSFERLOCAL(24, u8, 8, 32, 8, 8, _24, SwizzleBlock24); @@ -613,98 +1814,7 @@ DEFINE_TRANSFERLOCAL(8H, u8, 4, 32, 8, 8, _, SwizzleBlock8H); DEFINE_TRANSFERLOCAL(4HL, u8, 8, 32, 8, 8, _4, SwizzleBlock4HL); DEFINE_TRANSFERLOCAL(4HH, u8, 8, 32, 8, 8, _4, SwizzleBlock4HH); -//#define T u8 -//#define widthlimit 8 -//#define blockbits 4 -//#define blockwidth 32 -//#define blockheight 16 -// -//void TransferHostLocal4(const void* pbyMem, 
u32 nQWordSize) -//{ -// START_HOSTLOCAL(); -// -// const T* pbuf = (const T*)pbyMem; -// u32 nSize = nQWordSize*16*2/TRANSMIT_PITCH_4(2, T); -// nSize = min(nSize, gs.imageWnew * gs.imageHnew); -// -// int endY = ROUND_UPPOW2(i, blockheight); -// int alignedY = ROUND_DOWNPOW2(gs.imageEndY, blockheight); -// int alignedX = ROUND_DOWNPOW2(gs.imageEndX, blockwidth); -// bool bCanAlign = MOD_POW2(gs.trxpos.dx, blockwidth) == 0 && (j == gs.trxpos.dx) && (alignedY > endY) && alignedX > gs.trxpos.dx; -// -// if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { -// /* hack */ -// if( abs((int)nSize - (gs.imageEndY-i)*(gs.imageEndX-gs.trxpos.dx)+(j-gs.trxpos.dx)) <= widthlimit ) { -// /* don't transfer */ -// /*DEBUG_LOG("bad texture %s: %d %d %d\n", #psm, gs.trxpos.dx, gs.imageEndX, nQWordSize);*/ -// gs.imageTransfer = -1; -// } -// bCanAlign = false; -// } -// -// /* first align on block boundary */ -// if( MOD_POW2(i, blockheight) || !bCanAlign ) { -// -// if( !bCanAlign ) -// endY = gs.imageEndY; /* transfer the whole image */ -// else -// assert( endY < gs.imageEndY); /* part of alignment condition */ -// -// if( (DSTPSM == 0x13 || DSTPSM == 0x14) && ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { -// /* transmit with a width of 1 */ -// TRANSMIT_HOSTLOCAL_Y_4(4, T, (1+(DSTPSM == 0x14)), endY); -// } -// else { -// TRANSMIT_HOSTLOCAL_Y_4(4, T, widthlimit, endY); -// } -// -// if( nSize == 0 || i == gs.imageEndY ) -// goto End; -// } -// -// assert( MOD_POW2(i, blockheight) == 0 && j == gs.trxpos.dx); -// -// /* can align! */ -// int pitch = gs.imageEndX-gs.trxpos.dx; -// u32 area = pitch*blockheight; -// int fracX = gs.imageEndX-alignedX; -// -// /* on top of checking whether pbuf is alinged, make sure that the width is at least aligned to its limits (due to bugs in pcsx2) */ -// bool bAligned = !((u32)pbuf & 0xf) && (TRANSMIT_PITCH_4(pitch, T)&0xf) == 0; -// -// /* transfer aligning to blocks */ -// for(; i < alignedY && nSize >= area; i += blockheight, nSize -= area) { -// -// if( bAligned || ((DSTPSM==PSMCT24) || (DSTPSM==PSMT8H) || (DSTPSM==PSMT4HH) || (DSTPSM==PSMT4HL)) ) { -// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH_4(blockwidth, T)/sizeof(T)) { -// SwizzleBlock4(pstart + getPixelAddress4_0(tempj, i, gs.dstbuf.bw)*blockbits/8, -// (u8*)pbuf, TRANSMIT_PITCH_4(pitch, T)); -// } -// } -// else { -// for(int tempj = gs.trxpos.dx; tempj < alignedX; tempj += blockwidth, pbuf += TRANSMIT_PITCH_4(blockwidth, T)/sizeof(T)) { -// SwizzleBlock4u(pstart + getPixelAddress4_0(tempj, i, gs.dstbuf.bw)*blockbits/8, -// (u8*)pbuf, TRANSMIT_PITCH_4(pitch, T)); -// } -// } -// -// /* transfer the rest */ -// if( alignedX < gs.imageEndX ) { -// TRANSMIT_HOSTLOCAL_X_4(4, T, widthlimit, blockheight, alignedX); -// pbuf -= TRANSMIT_PITCH_4((alignedX-gs.trxpos.dx), T)/sizeof(T); -// } -// else pbuf += (blockheight-1)*TRANSMIT_PITCH_4(pitch, T)/sizeof(T); -// j = 0; -// } -// -// if( TRANSMIT_PITCH_4(nSize, T)/4 > 0 ) { -// TRANSMIT_HOSTLOCAL_Y_4(4, T, widthlimit, gs.imageEndY); -// /* sometimes wrong sizes are sent (tekken tag) */ -// assert( gs.imageTransfer == -1 || TRANSMIT_PITCH_4(nSize,T)/4 <= 2 ); -// } -// -// END_HOSTLOCAL(); -//} +#endif void TransferLocalHost32(void* pbyMem, u32 nQWordSize) {FUNCLOG diff --git a/plugins/zzogl-pg/opengl/Mem.h b/plugins/zzogl-pg/opengl/Mem.h index 63317313a8..d29fa93ca0 100644 --- a/plugins/zzogl-pg/opengl/Mem.h +++ b/plugins/zzogl-pg/opengl/Mem.h @@ -23,14 +23,26 @@ #include // works only when base is a power of 2 -#define 
ROUND_UPPOW2(val, base) (((val)+(base-1))&~(base-1)) -#define ROUND_DOWNPOW2(val, base) ((val)&~(base-1)) -#define MOD_POW2(val, base) ((val)&(base-1)) +static __forceinline int ROUND_UPPOW2(int val, int base) { return (((val)+(base-1))&~(base-1)); } +static __forceinline int ROUND_DOWNPOW2(int val, int base) { return ((val)&~(base-1)); } +static __forceinline int MOD_POW2(int val, int base) { return ((val)&(base-1)); } // d3d texture dims -#define BLOCK_TEXWIDTH 128 -#define BLOCK_TEXHEIGHT 512 +const int BLOCK_TEXWIDTH = 128; +const int BLOCK_TEXHEIGHT = 512; +extern PCSX2_ALIGNED16(u32 tempblock[64]); + + +typedef u32 ( *_getPixelAddress)(int x, int y, u32 bp, u32 bw); +typedef u32 (*_getPixelAddress_0)(int x, int y, u32 bw); +typedef void (*_writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw); +typedef void (*_writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw); +typedef u32 (*_readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw); +typedef u32 (*_readPixel_0)(const void* pmem, int x, int y, u32 bw); +typedef int (*_TransferHostLocal)(const void* pbyMem, u32 nQWordSize); +typedef void (*_TransferLocalHost)(void* pbyMem, u32 nQWordSize); +typedef void (__fastcall *_SwizzleBlock)(u8 *dst, u8 *src, int pitch, u32 WriteMask); // rest not visible externally struct BLOCK { @@ -46,14 +58,14 @@ struct BLOCK u32* blockTable; u32* columnTable; - u32 (*getPixelAddress)(int x, int y, u32 bp, u32 bw); - u32 (*getPixelAddress_0)(int x, int y, u32 bw); - void (*writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw); - void (*writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw); - u32 (*readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw); - u32 (*readPixel_0)(const void* pmem, int x, int y, u32 bw); - int (*TransferHostLocal)(const void* pbyMem, u32 nQWordSize); - void (*TransferLocalHost)(void* pbyMem, u32 nQWordSize); + _getPixelAddress getPixelAddress; + _getPixelAddress_0 getPixelAddress_0; + _writePixel writePixel; + _writePixel_0 writePixel_0; + _readPixel readPixel; + _readPixel_0 readPixel_0; + _TransferHostLocal TransferHostLocal; + _TransferLocalHost TransferLocalHost; // texture must be of dims BLOCK_TEXWIDTH and BLOCK_TEXHEIGHT static void FillBlocks(std::vector& vBlockData, std::vector& vBilinearData, int floatfmt); @@ -84,19 +96,17 @@ extern u32 g_pageTable16SZ[64][64]; extern u32 g_pageTable8[64][128]; extern u32 g_pageTable4[128][128]; -static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress32(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = bp * 64 + basepage * 2048 + g_pageTable32[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } -static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = basepage * 2048 + g_pageTable32[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } @@ -109,210 +119,221 @@ static __forceinline u32 getPixelAddress32_0(int x, int y, u32 bw) { #define getPixelAddress4HH getPixelAddress32 #define getPixelAddress4HH_0 getPixelAddress32_0 -static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16[y&63][x&63]; - //assert (word 
< 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16S(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16S[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16S_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16S[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress8(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * ((bw+127)>>7)) + (x>>7); u32 word = bp * 256 + basepage * 8192 + g_pageTable8[y&63][x&127]; - //assert (word < 0x400000); - //word = min(word, 0x3fffff); return word; } -static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress8_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * ((bw+127)>>7)) + (x>>7); u32 word = basepage * 8192 + g_pageTable8[y&63][x&127]; - //assert (word < 0x400000); - //word = min(word, 0x3fffff); return word; } -static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress4(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>7) * ((bw+127)>>7)) + (x>>7); u32 word = bp * 512 + basepage * 16384 + g_pageTable4[y&127][x&127]; - //assert (word < 0x800000); - //word = min(word, 0x7fffff); return word; } -static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress4_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>7) * ((bw+127)>>7)) + (x>>7); u32 word = basepage * 16384 + g_pageTable4[y&127][x&127]; - //assert (word < 0x800000); - //word = min(word, 0x7fffff); return word; } -static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress32Z(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = bp * 64 + basepage * 2048 + g_pageTable32Z[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } -static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress32Z_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>5) * (bw>>6)) + (x>>6); u32 word = basepage * 2048 + g_pageTable32Z[y&31][x&63]; - //assert (word < 0x100000); - //word = min(word, 0xfffff); return word; } #define getPixelAddress24Z getPixelAddress32Z #define getPixelAddress24Z_0 getPixelAddress32Z_0 -static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16Z(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16Z[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) { 
+static __forceinline u32 getPixelAddress16Z_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16Z[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw) { +static __forceinline u32 getPixelAddress16SZ(int x, int y, u32 bp, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = bp * 128 + basepage * 4096 + g_pageTable16SZ[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) { +static __forceinline u32 getPixelAddress16SZ_0(int x, int y, u32 bw) +{ u32 basepage = ((y>>6) * (bw>>6)) + (x>>6); u32 word = basepage * 4096 + g_pageTable16SZ[y&63][x&63]; - //assert (word < 0x200000); - //word = min(word, 0x1fffff); return word; } -static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +#define getPixelAddress_0(psm,x,y,bw) getPixelAddress##psm##_0(x,y,bw) +#define getPixelAddress(psm,x,y,bp,bw) getPixelAddress##psm##(x,y,bp,bw) + + + +static __forceinline void writePixel32(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u32*)pmem)[getPixelAddress32(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel24(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel24(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32(x, y, bp, bw)]; u8 *pix = (u8*)&pixel; buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; } -static __forceinline void writePixel16(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel16S(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16S(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16S(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel8(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel8(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u8*)pmem)[getPixelAddress8(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel8H(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel8H(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u8*)pmem)[4*getPixelAddress32(x, y, bp, bw)+3] = pixel; } -static __forceinline void writePixel4(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u32 addr = getPixelAddress4(x, y, bp, bw); u8 pix = ((u8*)pmem)[addr/2]; if (addr & 0x1) ((u8*)pmem)[addr/2] = (pix & 0x0f) | (pixel << 4); else ((u8*)pmem)[addr/2] = (pix & 0xf0) | (pixel); } -static __forceinline void writePixel4HL(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4HL(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HL(x, y, bp, bw)+3; *p = (*p & 0xf0) | pixel; } -static __forceinline void writePixel4HH(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel4HH(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HH(x, y, bp, 
bw)+3; *p = (*p & 0x0f) | (pixel<<4); } -static __forceinline void writePixel32Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel32Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel24Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel24Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ u8 *buf = (u8*)pmem + 4*getPixelAddress32Z(x, y, bp, bw); u8 *pix = (u8*)&pixel; buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; } -static __forceinline void writePixel16Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16Z(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16Z(x, y, bp, bw)] = pixel; } -static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { +static __forceinline void writePixel16SZ(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) +{ ((u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)] = pixel; } /////////////// -static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel32(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32(x, y, bp, bw)]; } -static __forceinline u32 readPixel24(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel24(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32(x, y, bp, bw)] & 0xffffff; } -static __forceinline u32 readPixel16(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16(x, y, bp, bw)]; } -static __forceinline u32 readPixel16S(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16S(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16S(x, y, bp, bw)]; } -static __forceinline u32 readPixel8(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel8(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u8*)pmem)[getPixelAddress8(x, y, bp, bw)]; } -static __forceinline u32 readPixel8H(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel8H(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u8*)pmem)[4*getPixelAddress32(x, y, bp, bw) + 3]; } -static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 bw) +{ u32 addr = getPixelAddress4(x, y, bp, bw); u8 pix = ((const u8*)pmem)[addr/2]; if (addr & 0x1) @@ -320,31 +341,37 @@ static __forceinline u32 readPixel4(const void* pmem, int x, int y, u32 bp, u32 else return pix & 0xf; } -static __forceinline u32 readPixel4HL(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4HL(const void* pmem, int x, int y, u32 bp, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HL(x, y, bp, bw)+3; return *p & 0x0f; } -static __forceinline u32 readPixel4HH(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel4HH(const void* pmem, int x, int y, u32 bp, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HH(x, y, bp, bw) + 3; return *p >> 4; } /////////////// -static 
__forceinline u32 readPixel32Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel32Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)]; } -static __forceinline u32 readPixel24Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel24Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z(x, y, bp, bw)] & 0xffffff; } -static __forceinline u32 readPixel16Z(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16Z(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16Z(x, y, bp, bw)]; } -static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u32 bw) { +static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16SZ(x, y, bp, bw)]; } @@ -352,135 +379,154 @@ static __forceinline u32 readPixel16SZ(const void* pmem, int x, int y, u32 bp, // Functions that take 0 bps // /////////////////////////////// -static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel32_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u32*)pmem)[getPixelAddress32_0(x, y, bw)] = pixel; } -static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel24_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *buf = (u8*)&((u32*)pmem)[getPixelAddress32_0(x, y, bw)]; u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } -static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16_0(x, y, bw)] = pixel; } -static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16S_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16S_0(x, y, bw)] = pixel; } -static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel8_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u8*)pmem)[getPixelAddress8_0(x, y, bw)] = pixel; } -static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel8H_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u8*)pmem)[4*getPixelAddress32_0(x, y, bw)+3] = pixel; } -static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u32 addr = getPixelAddress4_0(x, y, bw); u8 pix = ((u8*)pmem)[addr/2]; if (addr & 0x1) ((u8*)pmem)[addr/2] = (pix & 0x0f) | (pixel << 4); else ((u8*)pmem)[addr/2] = (pix & 0xf0) | (pixel); } -static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4HL_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *p = (u8*)pmem + 4*getPixelAddress4HL_0(x, y, bw)+3; *p = (*p & 0xf0) | pixel; } -static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel4HH_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *p = (u8*)pmem + 
4*getPixelAddress4HH_0(x, y, bw)+3; *p = (*p & 0x0f) | (pixel<<4); } -static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel32Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] = pixel; } -static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel24Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ u8 *buf = (u8*)pmem + 4*getPixelAddress32Z_0(x, y, bw); u8 *pix = (u8*)&pixel; -#if defined(_MSC_VER) && defined(__x86_64__) - memcpy(buf, pix, 3); -#else buf[0] = pix[0]; buf[1] = pix[1]; buf[2] = pix[2]; -#endif } -static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16Z_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16Z_0(x, y, bw)] = pixel; } -static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) { +static __forceinline void writePixel16SZ_0(void* pmem, int x, int y, u32 pixel, u32 bw) +{ ((u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)] = pixel; } /////////////// -static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel32_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)]; } -static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel24_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32_0(x, y, bw)] & 0xffffff; } -static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16_0(x, y, bw)]; } -static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16S_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16S_0(x, y, bw)]; } -static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel8_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u8*)pmem)[getPixelAddress8_0(x, y, bw)]; } -static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel8H_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u8*)pmem)[4*getPixelAddress32_0(x, y, bw) + 3]; } -static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4_0(const void* pmem, int x, int y, u32 bw) +{ u32 addr = getPixelAddress4_0(x, y, bw); u8 pix = ((const u8*)pmem)[addr/2]; if (addr & 0x1) - return pix >> 4; - else return pix & 0xf; + return pix >> 4; + else + return pix & 0xf; } -static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4HL_0(const void* pmem, int x, int y, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HL_0(x, y, bw)+3; return *p & 0x0f; } -static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel4HH_0(const void* pmem, int x, int y, u32 bw) +{ const u8 *p = (const u8*)pmem+4*getPixelAddress4HH_0(x, y, bw) + 3; return *p >> 4; } /////////////// -static __forceinline u32 readPixel32Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 
readPixel32Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)]; } -static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel24Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u32*)pmem)[getPixelAddress32Z_0(x, y, bw)] & 0xffffff; } -static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16Z_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16Z_0(x, y, bw)]; } -static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) { +static __forceinline u32 readPixel16SZ_0(const void* pmem, int x, int y, u32 bw) +{ return ((const u16*)pmem)[getPixelAddress16SZ_0(x, y, bw)]; } diff --git a/plugins/zzogl-pg/opengl/Mem_Swizzle.h b/plugins/zzogl-pg/opengl/Mem_Swizzle.h new file mode 100644 index 0000000000..b40d6673ed --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Swizzle.h @@ -0,0 +1,123 @@ +/* ZeroGS KOSMOS + * Copyright (C) 2005-2006 zerofrog@gmail.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef MEM_SWIZZLE_H_INCLUDED +#define MEM_SWIZZLE_H_INCLUDED + +#include "GS.h" +#include "Mem.h" + +// special swizzle macros - which I converted to functions. 
+ +static __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch-24) + { + for(int bx = 0; bx < 8; ++bx, pnewsrc += 3) + { + pblock[bx] = *(u32*)pnewsrc; + } + } + + for(int bx = 0; bx < 7; ++bx, pnewsrc += 3) + { + /* might be 1 byte out of bounds of GS memory */ + pblock[bx] = *(u32*)pnewsrc; + } + + /* do 3 bytes for the last copy */ + *((u8*)pblock+28) = pnewsrc[0]; + *((u8*)pblock+29) = pnewsrc[1]; + *((u8*)pblock+30) = pnewsrc[2]; + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff); +} + +#define SwizzleBlock24u SwizzleBlock24 + +static __forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<24; + pblock[1] = u<<16; + pblock[2] = u<<8; + pblock[3] = u; + u = *(u32*)(pnewsrc+4); + pblock[4] = u<<24; + pblock[5] = u<<16; + pblock[6] = u<<8; + pblock[7] = u; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xff000000); +} + +#define SwizzleBlock8Hu SwizzleBlock8H + +static __forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<28; + pblock[1] = u<<24; + pblock[2] = u<<20; + pblock[3] = u<<16; + pblock[4] = u<<12; + pblock[5] = u<<8; + pblock[6] = u<<4; + pblock[7] = u; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0xf0000000); +} + +#define SwizzleBlock4HHu SwizzleBlock4HH + +static __forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask = 0xffffffff) +{ + u8* pnewsrc = src; + u32* pblock = tempblock; + + for(int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch) + { + u32 u = *(u32*)pnewsrc; + pblock[0] = u<<24; + pblock[1] = u<<20; + pblock[2] = u<<16; + pblock[3] = u<<12; + pblock[4] = u<<8; + pblock[5] = u<<4; + pblock[6] = u; + pblock[7] = u>>4; + } + SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000); +} + +#define SwizzleBlock4HLu SwizzleBlock4HL + + +#endif // MEM_SWIZZLE_H_INCLUDED diff --git a/plugins/zzogl-pg/opengl/Mem_Tables.cpp b/plugins/zzogl-pg/opengl/Mem_Tables.cpp new file mode 100644 index 0000000000..2c40ae1b4c --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Tables.cpp @@ -0,0 +1,236 @@ +/* ZeroGS KOSMOS + * Copyright (C) 2005-2006 zerofrog@gmail.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "GS.h" + +u32 g_blockTable32[4][8] = { + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +u32 g_blockTable32Z[4][8] = { + { 24, 25, 28, 29, 8, 9, 12, 13}, + { 26, 27, 30, 31, 10, 11, 14, 15}, + { 16, 17, 20, 21, 0, 1, 4, 5}, + { 18, 19, 22, 23, 2, 3, 6, 7} +}; + +u32 g_blockTable16[8][4] = { + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +u32 g_blockTable16S[8][4] = { + { 0, 2, 16, 18 }, + { 1, 3, 17, 19 }, + { 8, 10, 24, 26 }, + { 9, 11, 25, 27 }, + { 4, 6, 20, 22 }, + { 5, 7, 21, 23 }, + { 12, 14, 28, 30 }, + { 13, 15, 29, 31 } +}; + +u32 g_blockTable16Z[8][4] = { + { 24, 26, 16, 18 }, + { 25, 27, 17, 19 }, + { 28, 30, 20, 22 }, + { 29, 31, 21, 23 }, + { 8, 10, 0, 2 }, + { 9, 11, 1, 3 }, + { 12, 14, 4, 6 }, + { 13, 15, 5, 7 } +}; + +u32 g_blockTable16SZ[8][4] = { + { 24, 26, 8, 10 }, + { 25, 27, 9, 11 }, + { 16, 18, 0, 2 }, + { 17, 19, 1, 3 }, + { 28, 30, 12, 14 }, + { 29, 31, 13, 15 }, + { 20, 22, 4, 6 }, + { 21, 23, 5, 7 } +}; + +u32 g_blockTable8[4][8] = { + { 0, 1, 4, 5, 16, 17, 20, 21}, + { 2, 3, 6, 7, 18, 19, 22, 23}, + { 8, 9, 12, 13, 24, 25, 28, 29}, + { 10, 11, 14, 15, 26, 27, 30, 31} +}; + +u32 g_blockTable4[8][4] = { + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 }, + { 16, 18, 24, 26 }, + { 17, 19, 25, 27 }, + { 20, 22, 28, 30 }, + { 21, 23, 29, 31 } +}; + +u32 g_columnTable32[8][8] = { + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 }, +}; + +u32 g_columnTable16[8][16] = { + { 0, 2, 8, 10, 16, 18, 24, 26, + 1, 3, 9, 11, 17, 19, 25, 27 }, + { 4, 6, 12, 14, 20, 22, 28, 30, + 5, 7, 13, 15, 21, 23, 29, 31 }, + { 32, 34, 40, 42, 48, 50, 56, 58, + 33, 35, 41, 43, 49, 51, 57, 59 }, + { 36, 38, 44, 46, 52, 54, 60, 62, + 37, 39, 45, 47, 53, 55, 61, 63 }, + { 64, 66, 72, 74, 80, 82, 88, 90, + 65, 67, 73, 75, 81, 83, 89, 91 }, + { 68, 70, 76, 78, 84, 86, 92, 94, + 69, 71, 77, 79, 85, 87, 93, 95 }, + { 96, 98, 104, 106, 112, 114, 120, 122, + 97, 99, 105, 107, 113, 115, 121, 123 }, + { 100, 102, 108, 110, 116, 118, 124, 126, + 101, 103, 109, 111, 117, 119, 125, 127 }, +}; + +u32 g_columnTable8[16][16] = { + { 0, 4, 16, 20, 32, 36, 48, 52, // column 0 + 2, 6, 18, 22, 34, 38, 50, 54 }, + { 8, 12, 24, 28, 40, 44, 56, 60, + 10, 14, 26, 30, 42, 46, 58, 62 }, + { 33, 37, 49, 53, 1, 5, 17, 21, + 35, 39, 51, 55, 3, 7, 19, 23 }, + { 41, 45, 57, 61, 9, 13, 25, 29, + 43, 47, 59, 63, 11, 15, 27, 31 }, + { 96, 100, 112, 116, 64, 68, 80, 84, // column 1 + 98, 102, 114, 118, 66, 70, 82, 86 }, + { 104, 108, 120, 124, 72, 76, 88, 92, + 106, 110, 122, 126, 74, 78, 90, 94 }, + { 65, 69, 81, 85, 97, 101, 113, 117, + 67, 71, 83, 87, 99, 103, 115, 119 }, + { 73, 77, 89, 93, 105, 109, 121, 125, + 75, 79, 91, 95, 107, 111, 123, 127 }, + { 128, 132, 144, 148, 160, 164, 176, 180, // column 2 + 130, 134, 146, 150, 162, 166, 178, 182 }, + { 136, 140, 152, 156, 168, 172, 184, 188, + 138, 142, 154, 158, 170, 
174, 186, 190 }, + { 161, 165, 177, 181, 129, 133, 145, 149, + 163, 167, 179, 183, 131, 135, 147, 151 }, + { 169, 173, 185, 189, 137, 141, 153, 157, + 171, 175, 187, 191, 139, 143, 155, 159 }, + { 224, 228, 240, 244, 192, 196, 208, 212, // column 3 + 226, 230, 242, 246, 194, 198, 210, 214 }, + { 232, 236, 248, 252, 200, 204, 216, 220, + 234, 238, 250, 254, 202, 206, 218, 222 }, + { 193, 197, 209, 213, 225, 229, 241, 245, + 195, 199, 211, 215, 227, 231, 243, 247 }, + { 201, 205, 217, 221, 233, 237, 249, 253, + 203, 207, 219, 223, 235, 239, 251, 255 }, +}; + +u32 g_columnTable4[16][32] = { + { 0, 8, 32, 40, 64, 72, 96, 104, // column 0 + 2, 10, 34, 42, 66, 74, 98, 106, + 4, 12, 36, 44, 68, 76, 100, 108, + 6, 14, 38, 46, 70, 78, 102, 110 }, + { 16, 24, 48, 56, 80, 88, 112, 120, + 18, 26, 50, 58, 82, 90, 114, 122, + 20, 28, 52, 60, 84, 92, 116, 124, + 22, 30, 54, 62, 86, 94, 118, 126 }, + { 65, 73, 97, 105, 1, 9, 33, 41, + 67, 75, 99, 107, 3, 11, 35, 43, + 69, 77, 101, 109, 5, 13, 37, 45, + 71, 79, 103, 111, 7, 15, 39, 47 }, + { 81, 89, 113, 121, 17, 25, 49, 57, + 83, 91, 115, 123, 19, 27, 51, 59, + 85, 93, 117, 125, 21, 29, 53, 61, + 87, 95, 119, 127, 23, 31, 55, 63 }, + { 192, 200, 224, 232, 128, 136, 160, 168, // column 1 + 194, 202, 226, 234, 130, 138, 162, 170, + 196, 204, 228, 236, 132, 140, 164, 172, + 198, 206, 230, 238, 134, 142, 166, 174 }, + { 208, 216, 240, 248, 144, 152, 176, 184, + 210, 218, 242, 250, 146, 154, 178, 186, + 212, 220, 244, 252, 148, 156, 180, 188, + 214, 222, 246, 254, 150, 158, 182, 190 }, + { 129, 137, 161, 169, 193, 201, 225, 233, + 131, 139, 163, 171, 195, 203, 227, 235, + 133, 141, 165, 173, 197, 205, 229, 237, + 135, 143, 167, 175, 199, 207, 231, 239 }, + { 145, 153, 177, 185, 209, 217, 241, 249, + 147, 155, 179, 187, 211, 219, 243, 251, + 149, 157, 181, 189, 213, 221, 245, 253, + 151, 159, 183, 191, 215, 223, 247, 255 }, + { 256, 264, 288, 296, 320, 328, 352, 360, // column 2 + 258, 266, 290, 298, 322, 330, 354, 362, + 260, 268, 292, 300, 324, 332, 356, 364, + 262, 270, 294, 302, 326, 334, 358, 366 }, + { 272, 280, 304, 312, 336, 344, 368, 376, + 274, 282, 306, 314, 338, 346, 370, 378, + 276, 284, 308, 316, 340, 348, 372, 380, + 278, 286, 310, 318, 342, 350, 374, 382 }, + { 321, 329, 353, 361, 257, 265, 289, 297, + 323, 331, 355, 363, 259, 267, 291, 299, + 325, 333, 357, 365, 261, 269, 293, 301, + 327, 335, 359, 367, 263, 271, 295, 303 }, + { 337, 345, 369, 377, 273, 281, 305, 313, + 339, 347, 371, 379, 275, 283, 307, 315, + 341, 349, 373, 381, 277, 285, 309, 317, + 343, 351, 375, 383, 279, 287, 311, 319 }, + { 448, 456, 480, 488, 384, 392, 416, 424, // column 3 + 450, 458, 482, 490, 386, 394, 418, 426, + 452, 460, 484, 492, 388, 396, 420, 428, + 454, 462, 486, 494, 390, 398, 422, 430 }, + { 464, 472, 496, 504, 400, 408, 432, 440, + 466, 474, 498, 506, 402, 410, 434, 442, + 468, 476, 500, 508, 404, 412, 436, 444, + 470, 478, 502, 510, 406, 414, 438, 446 }, + { 385, 393, 417, 425, 449, 457, 481, 489, + 387, 395, 419, 427, 451, 459, 483, 491, + 389, 397, 421, 429, 453, 461, 485, 493, + 391, 399, 423, 431, 455, 463, 487, 495 }, + { 401, 409, 433, 441, 465, 473, 497, 505, + 403, 411, 435, 443, 467, 475, 499, 507, + 405, 413, 437, 445, 469, 477, 501, 509, + 407, 415, 439, 447, 471, 479, 503, 511 }, +}; + +u32 g_pageTable32[32][64]; +u32 g_pageTable32Z[32][64]; +u32 g_pageTable16[64][64]; +u32 g_pageTable16S[64][64]; +u32 g_pageTable16Z[64][64]; +u32 g_pageTable16SZ[64][64]; +u32 g_pageTable8[64][128]; +u32 g_pageTable4[128][128]; diff --git 
a/plugins/zzogl-pg/opengl/Mem_Transmit.h b/plugins/zzogl-pg/opengl/Mem_Transmit.h new file mode 100644 index 0000000000..9e84ef2dc9 --- /dev/null +++ b/plugins/zzogl-pg/opengl/Mem_Transmit.h @@ -0,0 +1,184 @@ +#ifndef MEM_TRANSMIT_H_INCLUDED +#define MEM_TRANSMIT_H_INCLUDED + +#include "GS.h" +#include "Mem.h" + +#define DSTPSM gs.dstbuf.psm + +// transfers whole rows +#define TRANSMIT_HOSTLOCAL_Y_(psm, T, widthlimit, endY) { \ + assert( (nSize%widthlimit) == 0 && widthlimit <= 4 ); \ + if( (gs.imageEndX-gs.trxpos.dx)%widthlimit ) { \ + /*GS_LOG("Bad Transmission! %d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 1) { \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ + } \ + } \ + } \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += widthlimit) { \ + /* write as many pixel at one time as possible */ \ + if( nSize < widthlimit ) goto End; \ + writePixel##psm##_0(pstart, j%2048, i%2048, pbuf[0], gs.dstbuf.bw); \ + \ + if( widthlimit > 1 ) { \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, pbuf[1], gs.dstbuf.bw); \ + \ + if( widthlimit > 2 ) { \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, pbuf[2], gs.dstbuf.bw); \ + \ + if( widthlimit > 3 ) { \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, pbuf[3], gs.dstbuf.bw); \ + } \ + } \ + } \ + } \ + \ + if( j >= gs.imageEndX ) { assert(j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || nSize*sizeof(T)/4 == 0 ); goto End; } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = startX; j < gs.imageEndX; j++, pbuf++) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); \ + } \ + pbuf += pitch-fracX; \ + } \ +} \ + +//template +//static __forceinline void TransmitHostLocalX_(_writePixel_0 wp, u32 widthlimit, u32 blockheight, u32 startX) +//{ +// for(int tempi = 0; tempi < blockheight; ++tempi) +// { +// for(j = startX; j < gs.imageEndX; j++, pbuf++) +// { +// wp(pstart, j%2048, (i+tempi)%2048, pbuf[0], gs.dstbuf.bw); +// } +// pbuf += pitch - fracX; +// } +//} + +// transfers whole rows +#define TRANSMIT_HOSTLOCAL_Y_24(psm, T, widthlimit, endY) { \ + if( widthlimit != 8 || ((gs.imageEndX-gs.trxpos.dx)%widthlimit) ) { \ + /*GS_LOG("Bad Transmission! 
%d %d, psm: %d\n", gs.trxpos.dx, gs.imageEndX, DSTPSM);*/ \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += 1, nSize -= 1, pbuf += 3) { \ + writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf), gs.dstbuf.bw); \ + } \ + \ + if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || nSize == 0 ); goto End; } \ + } \ + } \ + else { \ + assert( /*(nSize%widthlimit) == 0 &&*/ widthlimit == 8 ); \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit, pbuf += 3*widthlimit) { \ + if( nSize < widthlimit ) goto End; \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, *(u32*)(pbuf+0), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *(u32*)(pbuf+3), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *(u32*)(pbuf+6), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *(u32*)(pbuf+9), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *(u32*)(pbuf+12), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *(u32*)(pbuf+15), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *(u32*)(pbuf+18), gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *(u32*)(pbuf+21), gs.dstbuf.bw); \ + } \ + \ + if( j >= gs.imageEndX ) { assert(gs.imageTransfer == -1 || j == gs.imageEndX); j = gs.trxpos.dx; } \ + else { \ + if( nSize < 0 ) { \ + /* extracted too much */ \ + assert( (nSize%3)==0 && nSize > -24 ); \ + j += nSize/3; \ + nSize = 0; \ + } \ + assert( gs.imageTransfer == -1 || nSize == 0 ); \ + goto End; \ + } \ + } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_24(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = startX; j < gs.imageEndX; j++, pbuf += 3) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, *(u32*)pbuf, gs.dstbuf.bw); \ + } \ + pbuf += 3*(pitch-fracX); \ + } \ +} \ + + +// meant for 4bit transfers +#define TRANSMIT_HOSTLOCAL_Y_4(psm, T, widthlimit, endY) { \ + for(; i < endY; ++i) { \ + for(; j < gs.imageEndX && nSize > 0; j += widthlimit, nSize -= widthlimit) { \ + /* write as many pixel at one time as possible */ \ + writePixel##psm##_0(pstart, j%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + if( widthlimit > 2 ) { \ + writePixel##psm##_0(pstart, (j+2)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+3)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + \ + if( widthlimit > 4 ) { \ + writePixel##psm##_0(pstart, (j+4)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+5)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + \ + if( widthlimit > 6 ) { \ + writePixel##psm##_0(pstart, (j+6)%2048, i%2048, *pbuf&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+7)%2048, i%2048, *pbuf>>4, gs.dstbuf.bw); \ + pbuf++; \ + } \ + } \ + } \ + } \ + \ + if( j >= gs.imageEndX ) { j = gs.trxpos.dx; } \ + else { assert( gs.imageTransfer == -1 || (nSize/32) == 0 ); goto End; } \ + } \ +} \ + +// transmit until endX, don't check size since it has already been prevalidated +#define TRANSMIT_HOSTLOCAL_X_4(psm, T, widthlimit, blockheight, startX) { \ + for(int tempi = 0; tempi < blockheight; ++tempi) { \ + for(j = 
startX; j < gs.imageEndX; j+=2, pbuf++) { \ + writePixel##psm##_0(pstart, j%2048, (i+tempi)%2048, pbuf[0]&0x0f, gs.dstbuf.bw); \ + writePixel##psm##_0(pstart, (j+1)%2048, (i+tempi)%2048, pbuf[0]>>4, gs.dstbuf.bw); \ + } \ + pbuf += (pitch-fracX)/2; \ + } \ +} \ + +#define TRANSMIT_HOSTLOCAL_X(th, psm, T, widthlimit, blockheight, startX) \ + TRANSMIT_HOSTLOCAL_X##th(psm, T, widthlimit, blockheight, startX) +#define TRANSMIT_HOSTLOCAL_Y(th, psm, T, widthlimit, endY) \ + TRANSMIT_HOSTLOCAL_Y##th(psm,T,widthlimit,endY) +// calculate pitch in source buffer + + +template +static __forceinline int TransmitPitch_(int pitch) { return (pitch * sizeof(T)); } +template +static __forceinline int TransmitPitch_24(int pitch) { return (pitch * 3); } +template +static __forceinline int TransmitPitch_4(int pitch) { return (pitch/2); } + +#define TRANSMIT_PITCH_(pitch, T) TransmitPitch_(pitch) +#define TRANSMIT_PITCH_24(pitch, T) TransmitPitch_24(pitch) +#define TRANSMIT_PITCH_4(pitch, T) TransmitPitch_4(pitch) + +#endif // MEM_TRANSMIT_H_INCLUDED diff --git a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj index ebdb1bac16..74ce2d3ad4 100644 --- a/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj +++ b/plugins/zzogl-pg/opengl/Win32/zerogsogl_2008.vcproj @@ -323,6 +323,10 @@ RelativePath="..\Mem.cpp" > + + @@ -439,6 +443,15 @@ RelativePath="..\Mem.h" > + + + + +
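The block-transfer setup repeated in each TransferHostLocal* variant above rests on the power-of-two helpers added to Mem.h and the per-format TransmitPitch_* helpers added to Mem_Transmit.h. The standalone sketch below uses the same formulas, but with hypothetical transfer-window and block sizes that are not taken from the patch; it shows how endY, alignedY and alignedX split a transfer into a slow per-pixel prologue/epilogue and a block-swizzled middle, and how the source pitch differs per pixel format.

#include <cstdio>

// Same power-of-two helpers the patch moves into Mem.h (base must be a power of 2).
static inline int RoundUpPow2(int val, int base)   { return (val + (base - 1)) & ~(base - 1); }
static inline int RoundDownPow2(int val, int base) { return val & ~(base - 1); }
static inline int ModPow2(int val, int base)       { return val & (base - 1); }

// Per-format source pitch in bytes, mirroring TransmitPitch_/_24/_4.
static inline int Pitch8(int texels)  { return texels * 1; }   // 8-bit: one byte per texel
static inline int Pitch24(int texels) { return texels * 3; }   // 24-bit: three bytes per texel
static inline int Pitch4(int texels)  { return texels / 2; }   // 4-bit: two texels per byte

int main()
{
    const int imageY = 5, imageEndY = 70, imageEndX = 100;    // hypothetical transfer window
    const int blockwidth = 16, blockheight = 16;              // hypothetical block size

    const int endY     = RoundUpPow2(imageY, blockheight);      // 16: first block-aligned row
    const int alignedY = RoundDownPow2(imageEndY, blockheight); // 64: end of the last full block row
    const int alignedX = RoundDownPow2(imageEndX, blockwidth);  // 96: end of the last full block column

    std::printf("endY=%d alignedY=%d alignedX=%d misalignment=%d\n",
                endY, alignedY, alignedX, ModPow2(imageY, blockheight));
    std::printf("source bytes per row of %d texels: 8bpp=%d 24bpp=%d 4bpp=%d\n",
                imageEndX, Pitch8(imageEndX), Pitch24(imageEndX), Pitch4(imageEndX));
    return 0;
}

With these inputs, rows 5-15 and 64-69 would go through the per-pixel TRANSMIT_HOSTLOCAL_Y_* path, rows 16-63 are swizzled a full block at a time, and any columns past alignedX are finished by TRANSMIT_HOSTLOCAL_X_*, matching the structure of the functions above.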