diff --git a/plugins/zzogl-pg/opengl/HostMemory.cpp b/plugins/zzogl-pg/opengl/HostMemory.cpp index 93ce759e48..aeb9960612 100644 --- a/plugins/zzogl-pg/opengl/HostMemory.cpp +++ b/plugins/zzogl-pg/opengl/HostMemory.cpp @@ -135,8 +135,7 @@ #if defined(ZEROGS_DEVBUILD) if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw) - ZZLog::Warn_Log("Transfer error, width exceeded."); - + ZZLog::Debug_Log("Transfer error, width exceeded."); #endif //bool bHasFlushed = false; @@ -260,10 +259,8 @@ assert(gs.trxpos.sx + gs.imageWnew <= 2048 && gs.trxpos.sy + gs.imageHnew <= 2048); #if defined(ZEROGS_DEVBUILD) - if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw) - ZZLog::Warn_Log("Transfer error, width exceeded."); - + ZZLog::Debug_Log("Transfer error, width exceeded."); #endif gs.imageX = gs.trxpos.sx; @@ -468,10 +465,10 @@ __forceinline void _TransferLocalLocal_4() assert((gs.srcbuf.psm&0x7) == (gs.dstbuf.psm&0x7)); if (gs.trxpos.sx + gs.imageWnew > gs.srcbuf.bw) - ZZLog::Warn_Log("Transfer error, src width exceeded."); + ZZLog::Debug_Log("Transfer error, src width exceeded."); if (gs.trxpos.dx + gs.imageWnew > gs.dstbuf.bw) - ZZLog::Warn_Log("Transfer error, dst width exceeded."); + ZZLog::Debug_Log("Transfer error, dst width exceeded."); int srcstart, srcend, dststart, dstend; diff --git a/plugins/zzogl-pg/opengl/x86.cpp b/plugins/zzogl-pg/opengl/x86.cpp index d32718dbfb..bab966fd2d 100644 --- a/plugins/zzogl-pg/opengl/x86.cpp +++ b/plugins/zzogl-pg/opengl/x86.cpp @@ -27,172 +27,520 @@ #endif // swizzling -#define _FrameSwizzleBlock(type, transfer, transfer16, incsrc) \ -/* FrameSwizzleBlock32 */ \ -void __fastcall FrameSwizzleBlock32##type##c(u32* dst, u32* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable32[0][0]; \ - \ - if( WriteMask == 0xffffffff ) { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - dst[d[j]] = (transfer); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - dst[d[j]] = ((transfer)&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ -} \ -\ -void __fastcall FrameSwizzleBlock24##type##c(u32* dst, u32* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable32[0][0]; \ - \ - if( WriteMask == 0xffffffff ) { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - dst[d[j]] = (transfer); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - dst[d[j]] = ((transfer)&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ -} \ -\ -/* FrameSwizzleBlock16 */ \ -void __fastcall FrameSwizzleBlock16##type##c(u16* dst, u32* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable16[0][0]; \ - \ - if( WriteMask == 0xffff ) { \ - for(int i = 0; i < 8; ++i, d += 16) { \ - for(int j = 0; j < 16; ++j) { \ - u32 temp = (transfer); \ - dst[d[j]] = RGBA32to16(temp); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 16) { \ - for(int j = 0; j < 16; ++j) { \ - u32 temp = (transfer); \ - u32 dsrc = RGBA32to16(temp); \ - dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ -} \ -\ -/* Frame16SwizzleBlock32 */ \ -void __fastcall Frame16SwizzleBlock32##type##c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable32[0][0]; \ -\ - if( WriteMask == 0xffffffff ) { \ - for(int 
i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - dst[d[j]] = Float16ToARGB(dsrc16); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - u32 dsrc = Float16ToARGB(dsrc16); \ - dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - } \ -\ -/* Frame16SwizzleBlock32Z */ \ -void __fastcall Frame16SwizzleBlock32Z##type##c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable32[0][0]; \ - if( WriteMask == 0xffffffff ) { /* breaks KH text if not checked */ \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - dst[d[j]] = Float16ToARGB_Z(dsrc16); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 8) { \ - for(int j = 0; j < 8; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - u32 dsrc = Float16ToARGB_Z(dsrc16); \ - dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - } \ - \ - /* Frame16SwizzleBlock16 */ \ -void __fastcall Frame16SwizzleBlock16##type##c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable16[0][0]; \ - \ - if( (WriteMask&0xfff8f8f8) == 0xfff8f8f8) { \ - for(int i = 0; i < 8; ++i, d += 16) { \ - for(int j = 0; j < 16; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - dst[d[j]] = Float16ToARGB16(dsrc16); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - else { \ - for(int i = 0; i < 8; ++i, d += 16) { \ - for(int j = 0; j < 16; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - u32 dsrc = Float16ToARGB16(dsrc16); \ - dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); \ - } \ - src += srcpitch << incsrc; \ - } \ - } \ - } \ - \ - /* Frame16SwizzleBlock16Z */ \ -void __fastcall Frame16SwizzleBlock16Z##type##c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) \ -{ \ - u32* d = &g_columnTable16[0][0]; \ - \ - for(int i = 0; i < 8; ++i, d += 16) { \ - for(int j = 0; j < 16; ++j) { \ - Vector_16F dsrc16 = (transfer16); \ - dst[d[j]] = Float16ToARGB16_Z(dsrc16); \ - } \ - src += srcpitch << incsrc; \ - } \ -} \ + +//These were only used in the old version of RESOLVE_32_BITS. Keeping for reference. 
+#if 0 + +/* FrameSwizzleBlock32 */ +void __fastcall FrameSwizzleBlock32_c(u32* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if (WriteMask == 0xffffffff) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = (src[j]); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = ((src[j])&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } +} + +void __fastcall FrameSwizzleBlock32A2_c(u32* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if( WriteMask == 0xffffffff ) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = ((src[2*j] + src[2*j+1]) >> 1); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = (((src[2*j] + src[2*j+1]) >> 1)&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } +} + +void __fastcall FrameSwizzleBlock32A4_c(u32* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if( WriteMask == 0xffffffff ) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2); + } + src += srcpitch << 1; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + dst[d[j]] = (((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2)&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch << 1; + } + } +} + +#define FrameSwizzleBlock24_c FrameSwizzleBlock32_c +#define FrameSwizzleBlock24A2_c FrameSwizzleBlock32A2_c +#define FrameSwizzleBlock24A4_c FrameSwizzleBlock32A4_c + +/* FrameSwizzleBlock16 */ +void __fastcall FrameSwizzleBlock16_c(u16* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if (WriteMask == 0xffff) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = (src[j]); + dst[d[j]] = RGBA32to16(temp); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = (src[j]); + u32 dsrc = RGBA32to16(temp); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } +} + +void __fastcall FrameSwizzleBlock16A2_c(u16* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if (WriteMask == 0xffff) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = ((src[2*j] + src[2*j+1]) >> 1); + dst[d[j]] = RGBA32to16(temp); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = ((src[2*j] + src[2*j+1]) >> 1); + u32 dsrc = RGBA32to16(temp); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } +} + +void __fastcall FrameSwizzleBlock16A4_c(u16* dst, u32* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if (WriteMask == 0xffff) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2); + dst[d[j]] = RGBA32to16(temp); + } + src += srcpitch << 1; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + u32 temp = ((src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2); 
+ u32 dsrc = RGBA32to16(temp); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch << 1; + } + } +} + + +/* Frame16SwizzleBlock32 */ +void __fastcall Frame16SwizzleBlock32_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if( WriteMask == 0xffffffff ) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[j]); + dst[d[j]] = Float16ToARGB(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[j]); + u32 dsrc = Float16ToARGB(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } -_FrameSwizzleBlock(_, src[j], src[j], 0); -_FrameSwizzleBlock(A2_, (src[2*j] + src[2*j+1]) >> 1, src[2*j], 0); -_FrameSwizzleBlock(A4_, (src[2*j] + src[2*j+1] + src[2*j+srcpitch] + src[2*j+srcpitch+1]) >> 2, src[2*j], 1); +void __fastcall Frame16SwizzleBlock32A2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if( WriteMask == 0xffffffff ) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } + +void __fastcall Frame16SwizzleBlock32A4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + + if( WriteMask == 0xffffffff ) + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB(dsrc16); + } + src += srcpitch << 1; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch << 1; + } + } + } + +/* Frame16SwizzleBlock32Z */ +void __fastcall Frame16SwizzleBlock32Z_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + if( WriteMask == 0xffffffff ) /* breaks KH text if not checked */ + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[j]); + dst[d[j]] = Float16ToARGB_Z(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[j]); + u32 dsrc = Float16ToARGB_Z(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } + +void __fastcall Frame16SwizzleBlock32ZA2_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + if( WriteMask == 0xffffffff ) /* breaks KH text if not checked */ + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB_Z(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB_Z(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } + +void __fastcall Frame16SwizzleBlock32ZA4_c(u32* dst, Vector_16F* src, int 
srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable32[0][0]; + if( WriteMask == 0xffffffff ) /* breaks KH text if not checked */ + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB_Z(dsrc16); + } + src += srcpitch << 1; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 8) + { + for(int j = 0; j < 8; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB_Z(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch << 1; + } + } + } + + + /* Frame16SwizzleBlock16 */ +void __fastcall Frame16SwizzleBlock16_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[j]); + dst[d[j]] = Float16ToARGB16(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[j]); + u32 dsrc = Float16ToARGB16(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } + +void __fastcall Frame16SwizzleBlock16A2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB16(dsrc16); + } + src += srcpitch; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB16(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch; + } + } + } + +void __fastcall Frame16SwizzleBlock16A4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + if ((WriteMask&0xfff8f8f8) == 0xfff8f8f8) + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB16(dsrc16); + } + src += srcpitch << 1; + } + } + else + { + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + u32 dsrc = Float16ToARGB16(dsrc16); + dst[d[j]] = (dsrc&WriteMask)|(dst[d[j]]&~WriteMask); + } + src += srcpitch << 1; + } + } + } + + /* Frame16SwizzleBlock16Z */ +void __fastcall Frame16SwizzleBlock16Z_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[j]); + dst[d[j]] = Float16ToARGB16_Z(dsrc16); + } + src += srcpitch; + } +} + +void __fastcall Frame16SwizzleBlock16ZA2_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB16_Z(dsrc16); + } + src += srcpitch; + } +} + +void __fastcall Frame16SwizzleBlock16ZA4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask) +{ + u32* d = &g_columnTable16[0][0]; + + for(int i = 0; i < 8; ++i, d += 16) + { + for(int j = 0; j < 16; ++j) + { + Vector_16F dsrc16 = (src[2*j]); + dst[d[j]] = Float16ToARGB16_Z(dsrc16); + } + src += srcpitch << 1; + } +} +#endif #ifdef ZEROGS_SSE2 @@ -767,4 +1115,3 @@ Z16Loop: ); #endif // _MSC_VER } - diff --git 
a/plugins/zzogl-pg/opengl/x86.h b/plugins/zzogl-pg/opengl/x86.h index 129a627315..ec885c630e 100644 --- a/plugins/zzogl-pg/opengl/x86.h +++ b/plugins/zzogl-pg/opengl/x86.h @@ -33,6 +33,7 @@ extern "C" void __fastcall SwizzleBlock4u_sse2(u8* dst, u8* src, int srcpitch); // frame swizzling +#if 0 // no AA extern "C" void __fastcall FrameSwizzleBlock32_sse2(u32* dst, u32* src, int srcpitch, u32 WriteMask); extern "C" void __fastcall FrameSwizzleBlock16_sse2(u16* dst, u32* src, int srcpitch, u32 WriteMask); @@ -88,6 +89,7 @@ extern void __fastcall Frame16SwizzleBlock32A4_c(u32* dst, Vector_16F* src, int extern void __fastcall Frame16SwizzleBlock32ZA4_c(u32* dst, Vector_16F* src, int srcpitch, u32 WriteMask); extern void __fastcall Frame16SwizzleBlock16A4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask); extern void __fastcall Frame16SwizzleBlock16ZA4_c(u16* dst, Vector_16F* src, int srcpitch, u32 WriteMask); +#endif extern void __fastcall SwizzleColumn32_c(int y, u8* dst, u8* src, int srcpitch, u32 WriteMask = 0xffffffff); extern void __fastcall SwizzleColumn16_c(int y, u8* dst, u8* src, int srcpitch); diff --git a/plugins/zzogl-pg/opengl/zerogsmath.h b/plugins/zzogl-pg/opengl/zerogsmath.h index 03ea3d77da..3177e710db 100644 --- a/plugins/zzogl-pg/opengl/zerogsmath.h +++ b/plugins/zzogl-pg/opengl/zerogsmath.h @@ -17,31 +17,7 @@ #include #include -#ifndef PI -#define PI ((dReal)3.141592654) -#endif - -#define rswap(x, y) *(int*)&(x) ^= *(int*)&(y) ^= *(int*)&(x) ^= *(int*)&(y); - -template inline T RAD_2_DEG(T radians) { return (radians * (T)57.29577951); } - -class Transform; - -class TransformMatrix; - typedef float dReal; -typedef dReal dMatrix3[3*4]; - -inline dReal* normalize3(dReal* pfout, const dReal* pf); -inline dReal* normalize4(dReal* pfout, const dReal* pf); -inline dReal* cross3(dReal* pfout, const dReal* pf1, const dReal* pf2); - -// multiplies 3x3 matrices -inline dReal* mult3(dReal* pfres, const dReal* pf1, const dReal* pf2); -inline double* mult3(double* pfres, const double* pf1, const double* pf2); - -inline dReal* inv3(const dReal* pf, dReal* pfres, int stride); -inline dReal* inv4(const dReal* pf, dReal* pfres); // class used for 3 and 4 dim vectors and quaternions // It is better to use this for a 3 dim vector because it is 16byte aligned and SIMD instructions can be used @@ -65,7 +41,6 @@ class Vector // SCALAR FUNCTIONS inline dReal dot(const Vector &v) const { return x*v.x + y*v.y + z*v.z + w*v.w; } - inline void normalize() { normalize4(&x, &x); } inline void Set3(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; } inline void Set4(const float* pvals) { x = pvals[0]; y = pvals[1]; z = pvals[2]; w = pvals[3]; } inline void SetColor(u32 color) @@ -77,9 +52,7 @@ class Vector // 3 dim cross product, w is not touched /// this = this x v - inline void Cross(const Vector &v) { cross3(&x, &x, v); } /// this = u x v - inline void Cross(const Vector &u, const Vector &v) { cross3(&x, u, v); } inline Vector operator-() const { Vector v; v.x = -x; v.y = -y; v.z = -z; v.w = -w; return v; } inline Vector operator+(const Vector &r) const { Vector v; v.x = x + r.x; v.y = y + r.y; v.z = z + r.z; v.w = w + r.w; return v; } inline Vector operator-(const Vector &r) const { Vector v; v.x = x - r.x; v.y = y - r.y; v.z = z - r.z; v.w = w - r.w; return v; } @@ -104,947 +77,4 @@ inline Vector operator*(float f, const Vector& left) return v; } -struct AABB -{ - Vector pos, extents; -}; - -struct OBB -{ - Vector right, up, dir, pos, extents; -}; - -struct TRIANGLE -{ - 
TRIANGLE() {} - - TRIANGLE(const Vector& v1, const Vector& v2, const Vector& v3) : v1(v1), v2(v2), v3(v3) {} - - ~TRIANGLE() {} - - Vector v1, v2, v3; //!< the vertices of the triangle - - const Vector& operator[](int i) const { return (&v1)[i]; } - - Vector& operator[](int i) { return (&v1)[i]; } - - /// assumes CCW ordering of vertices - inline Vector ComputeNormal() - { - Vector normal; - cross3(normal, v2 - v1, v3 - v1); - return normal; - } -}; - -// Routines made for 3D graphics that deal with 3 or 4 dim algebra structures -// Functions with postfix 3 are for 3x3 operations, etc - -// all fns return pfout on success or NULL on failure -// results and arguments can share pointers - - -// multiplies 4x4 matrices -inline dReal* mult4(dReal* pfres, const dReal* pf1, const dReal* pf2); -inline double* mult4(double* pfres, const double* pf1, const double* pf2); - -// pf1^T * pf2 -inline dReal* multtrans3(dReal* pfres, const dReal* pf1, const dReal* pf2); -inline double* multtrans3(double* pfres, const double* pf1, const double* pf2); -inline dReal* multtrans4(dReal* pfres, const dReal* pf1, const dReal* pf2); -inline double* multtrans4(double* pfres, const double* pf1, const double* pf2); - -inline dReal* transpose3(const dReal* pf, dReal* pfres); -inline double* transpose3(const double* pf, double* pfres); -inline dReal* transpose4(const dReal* pf, dReal* pfres); -inline double* transpose4(const double* pf, double* pfres); - -inline dReal dot2(const dReal* pf1, const dReal* pf2); -inline dReal dot3(const dReal* pf1, const dReal* pf2); -inline dReal dot4(const dReal* pf1, const dReal* pf2); - -inline dReal lengthsqr2(const dReal* pf); -inline dReal lengthsqr3(const dReal* pf); -inline dReal lengthsqr4(const dReal* pf); - -inline dReal* normalize2(dReal* pfout, const dReal* pf); -inline dReal* normalize3(dReal* pfout, const dReal* pf); -inline dReal* normalize4(dReal* pfout, const dReal* pf); - -//// -// More complex ops that deal with arbitrary matrices // -//// - -// extract eigen values and vectors from a 2x2 matrix and returns true if all values are real -// returned eigen vectors are normalized -inline bool eig2(const dReal* pfmat, dReal* peigs, dReal& fv1x, dReal& fv1y, dReal& fv2x, dReal& fv2y); - -// Simple routines for linear algebra algorithms // -int CubicRoots(double c0, double c1, double c2, double *r0, double *r1, double *r2); -bool QLAlgorithm3(dReal* m_aafEntry, dReal* afDiag, dReal* afSubDiag); - -void EigenSymmetric3(dReal* fCovariance, dReal* eval, dReal* fAxes); - -void GetCovarBasisVectors(dReal fCovariance[3][3], Vector* vRight, Vector* vUp, Vector* vDir); - -// first root returned is always >= second, roots are defined if the quadratic doesn't have real solutions -void QuadraticSolver(dReal* pfQuadratic, dReal* pfRoots); - -int insideQuadrilateral(const Vector* p0, const Vector* p1, const Vector* p2, const Vector* p3); -int insideTriangle(const Vector* p0, const Vector* p1, const Vector* p2); - -// multiplies a matrix by a scalar -template inline void mult(T* pf, T fa, int r); - -// multiplies a r1xc1 by c1xc2 matrix into pfres, if badd is true adds the result to pfres -// does not handle cases where pfres is equal to pf1 or pf2, use multtox for those cases -template -inline T* mult(T* pf1, R* pf2, int r1, int c1, int c2, S* pfres, bool badd = false); - -// pf1 is transposed before mult -// rows of pf2 must equal rows of pf1 -// pfres will be c1xc2 matrix -template -inline T* multtrans(T* pf1, R* pf2, int r1, int c1, int c2, S* pfres, bool badd = false); - -// pf2 is 
transposed before mult -// the columns of both matrices must be the same and equal to c1 -// r2 is the number of rows in pf2 -// pfres must be an r1xr2 matrix -template -inline T* multtrans_to2(T* pf1, R* pf2, int r1, int c1, int r2, S* pfres, bool badd = false); - -// multiplies rxc matrix pf1 and cxc matrix pf2 and stores the result in pf1, -// the function needs a temporary buffer the size of c doubles, if pftemp == NULL, -// the function will allocate the necessary memory, otherwise pftemp should be big -// enough to store all the entries -template inline T* multto1(T* pf1, T* pf2, int r1, int c1, T* pftemp = NULL); - -// same as multto1 except stores the result in pf2, pf1 has to be an r2xr2 matrix -// pftemp must be of size r2 if not NULL -template inline T* multto2(T* pf1, S* pf2, int r2, int c2, S* pftemp = NULL); - -// add pf1 + pf2 and store in pf1 -template inline void sub(T* pf1, T* pf2, int r); -template inline T normsqr(T* pf1, int r); -template inline T lengthsqr(T* pf1, T* pf2, int length); -template inline T dot(T* pf1, T* pf2, int length); - -template inline T sum(T* pf, int length); - -// takes the inverse of the 3x3 matrix pf and stores it into pfres, returns true if matrix is invertible -template inline bool inv2(T* pf, T* pfres); - -/////////////////////// -// Function Definitions -/////////////////////// -bool eig2(const dReal* pfmat, dReal* peigs, dReal& fv1x, dReal& fv1y, dReal& fv2x, dReal& fv2y) -{ - // x^2 + bx + c - dReal a, b, c, d; - b = -(pfmat[0] + pfmat[3]); - c = pfmat[0] * pfmat[3] - pfmat[1] * pfmat[2]; - d = b * b - 4.0f * c + 1e-16f; - - if (d < 0) return false; - - if (d < 1e-16f) - { - a = -0.5f * b; - peigs[0] = a; - peigs[1] = a; - fv1x = pfmat[1]; - fv1y = a - pfmat[0]; - c = 1 / sqrtf(fv1x * fv1x + fv1y * fv1y); - fv1x *= c; - fv1y *= c; - fv2x = -fv1y; - fv2y = fv1x; - return true; - } - - // two roots - d = sqrtf(d); - - a = -0.5f * (b + d); - peigs[0] = a; - - fv1x = pfmat[1]; - fv1y = a - pfmat[0]; - - c = 1 / sqrtf(fv1x * fv1x + fv1y * fv1y); - - fv1x *= c; - fv1y *= c; - - a += d; - peigs[1] = a; - - fv2x = pfmat[1]; - fv2y = a - pfmat[0]; - - c = 1 / sqrtf(fv2x * fv2x + fv2y * fv2y); - - fv2x *= c; - fv2y *= c; - - return true; -} - -//#ifndef TI_USING_IPP - -// Functions that are replacable by ipp library funcs -template inline T* _mult3(T* pfres, const T* pf1, const T* pf2) -{ - assert(pf1 != NULL && pf2 != NULL && pfres != NULL); - - T* pfres2; - - if (pfres == pf1 || pfres == pf2) - pfres2 = (T*)alloca(9 * sizeof(T)); - else - pfres2 = pfres; - - pfres2[0*4+0] = pf1[0*4+0] * pf2[0*4+0] + pf1[0*4+1] * pf2[1*4+0] + pf1[0*4+2] * pf2[2*4+0]; - pfres2[0*4+1] = pf1[0*4+0] * pf2[0*4+1] + pf1[0*4+1] * pf2[1*4+1] + pf1[0*4+2] * pf2[2*4+1]; - pfres2[0*4+2] = pf1[0*4+0] * pf2[0*4+2] + pf1[0*4+1] * pf2[1*4+2] + pf1[0*4+2] * pf2[2*4+2]; - - pfres2[1*4+0] = pf1[1*4+0] * pf2[0*4+0] + pf1[1*4+1] * pf2[1*4+0] + pf1[1*4+2] * pf2[2*4+0]; - pfres2[1*4+1] = pf1[1*4+0] * pf2[0*4+1] + pf1[1*4+1] * pf2[1*4+1] + pf1[1*4+2] * pf2[2*4+1]; - pfres2[1*4+2] = pf1[1*4+0] * pf2[0*4+2] + pf1[1*4+1] * pf2[1*4+2] + pf1[1*4+2] * pf2[2*4+2]; - - pfres2[2*4+0] = pf1[2*4+0] * pf2[0*4+0] + pf1[2*4+1] * pf2[1*4+0] + pf1[2*4+2] * pf2[2*4+0]; - pfres2[2*4+1] = pf1[2*4+0] * pf2[0*4+1] + pf1[2*4+1] * pf2[1*4+1] + pf1[2*4+2] * pf2[2*4+1]; - pfres2[2*4+2] = pf1[2*4+0] * pf2[0*4+2] + pf1[2*4+1] * pf2[1*4+2] + pf1[2*4+2] * pf2[2*4+2]; - - if (pfres2 != pfres) memcpy(pfres, pfres2, 9*sizeof(T)); - - return pfres; -} - -inline dReal* mult3(dReal* pfres, const dReal* pf1, const dReal* 
pf2) { return _mult3(pfres, pf1, pf2); } - -inline double* mult3(double* pfres, const double* pf1, const double* pf2) { return _mult3(pfres, pf1, pf2); } - -template -inline T* _mult4(T* pfres, const T* p1, const T* p2) -{ - assert(pfres != NULL && p1 != NULL && p2 != NULL); - - T* pfres2; - - if (pfres == p1 || pfres == p2) - pfres2 = (T*)alloca(16 * sizeof(T)); - else - pfres2 = pfres; - - pfres2[0*4+0] = p1[0*4+0] * p2[0*4+0] + p1[0*4+1] * p2[1*4+0] + p1[0*4+2] * p2[2*4+0] + p1[0*4+3] * p2[3*4+0]; - pfres2[0*4+1] = p1[0*4+0] * p2[0*4+1] + p1[0*4+1] * p2[1*4+1] + p1[0*4+2] * p2[2*4+1] + p1[0*4+3] * p2[3*4+1]; - pfres2[0*4+2] = p1[0*4+0] * p2[0*4+2] + p1[0*4+1] * p2[1*4+2] + p1[0*4+2] * p2[2*4+2] + p1[0*4+3] * p2[3*4+2]; - pfres2[0*4+3] = p1[0*4+0] * p2[0*4+3] + p1[0*4+1] * p2[1*4+3] + p1[0*4+2] * p2[2*4+3] + p1[0*4+3] * p2[3*4+3]; - - pfres2[1*4+0] = p1[1*4+0] * p2[0*4+0] + p1[1*4+1] * p2[1*4+0] + p1[1*4+2] * p2[2*4+0] + p1[1*4+3] * p2[3*4+0]; - pfres2[1*4+1] = p1[1*4+0] * p2[0*4+1] + p1[1*4+1] * p2[1*4+1] + p1[1*4+2] * p2[2*4+1] + p1[1*4+3] * p2[3*4+1]; - pfres2[1*4+2] = p1[1*4+0] * p2[0*4+2] + p1[1*4+1] * p2[1*4+2] + p1[1*4+2] * p2[2*4+2] + p1[1*4+3] * p2[3*4+2]; - pfres2[1*4+3] = p1[1*4+0] * p2[0*4+3] + p1[1*4+1] * p2[1*4+3] + p1[1*4+2] * p2[2*4+3] + p1[1*4+3] * p2[3*4+3]; - - pfres2[2*4+0] = p1[2*4+0] * p2[0*4+0] + p1[2*4+1] * p2[1*4+0] + p1[2*4+2] * p2[2*4+0] + p1[2*4+3] * p2[3*4+0]; - pfres2[2*4+1] = p1[2*4+0] * p2[0*4+1] + p1[2*4+1] * p2[1*4+1] + p1[2*4+2] * p2[2*4+1] + p1[2*4+3] * p2[3*4+1]; - pfres2[2*4+2] = p1[2*4+0] * p2[0*4+2] + p1[2*4+1] * p2[1*4+2] + p1[2*4+2] * p2[2*4+2] + p1[2*4+3] * p2[3*4+2]; - pfres2[2*4+3] = p1[2*4+0] * p2[0*4+3] + p1[2*4+1] * p2[1*4+3] + p1[2*4+2] * p2[2*4+3] + p1[2*4+3] * p2[3*4+3]; - - pfres2[3*4+0] = p1[3*4+0] * p2[0*4+0] + p1[3*4+1] * p2[1*4+0] + p1[3*4+2] * p2[2*4+0] + p1[3*4+3] * p2[3*4+0]; - pfres2[3*4+1] = p1[3*4+0] * p2[0*4+1] + p1[3*4+1] * p2[1*4+1] + p1[3*4+2] * p2[2*4+1] + p1[3*4+3] * p2[3*4+1]; - pfres2[3*4+2] = p1[3*4+0] * p2[0*4+2] + p1[3*4+1] * p2[1*4+2] + p1[3*4+2] * p2[2*4+2] + p1[3*4+3] * p2[3*4+2]; - pfres2[3*4+3] = p1[3*4+0] * p2[0*4+3] + p1[3*4+1] * p2[1*4+3] + p1[3*4+2] * p2[2*4+3] + p1[3*4+3] * p2[3*4+3]; - - if (pfres != pfres2) memcpy(pfres, pfres2, sizeof(T)*16); - - return pfres; -} - -inline dReal* mult4(dReal* pfres, const dReal* pf1, const dReal* pf2) { return _mult4(pfres, pf1, pf2); } -inline double* mult4(double* pfres, const double* pf1, const double* pf2) { return _mult4(pfres, pf1, pf2); } - -template -inline T* _multtrans3(T* pfres, const T* pf1, const T* pf2) -{ - T* pfres2; - - if (pfres == pf1) - pfres2 = (T*)alloca(9 * sizeof(T)); - else - pfres2 = pfres; - - pfres2[0] = pf1[0] * pf2[0] + pf1[3] * pf2[3] + pf1[6] * pf2[6]; - pfres2[1] = pf1[0] * pf2[1] + pf1[3] * pf2[4] + pf1[6] * pf2[7]; - pfres2[2] = pf1[0] * pf2[2] + pf1[3] * pf2[5] + pf1[6] * pf2[8]; - pfres2[3] = pf1[1] * pf2[0] + pf1[4] * pf2[3] + pf1[7] * pf2[6]; - pfres2[4] = pf1[1] * pf2[1] + pf1[4] * pf2[4] + pf1[7] * pf2[7]; - pfres2[5] = pf1[1] * pf2[2] + pf1[4] * pf2[5] + pf1[7] * pf2[8]; - pfres2[6] = pf1[2] * pf2[0] + pf1[5] * pf2[3] + pf1[8] * pf2[6]; - pfres2[7] = pf1[2] * pf2[1] + pf1[5] * pf2[4] + pf1[8] * pf2[7]; - pfres2[8] = pf1[2] * pf2[2] + pf1[5] * pf2[5] + pf1[8] * pf2[8]; - - if (pfres2 != pfres) memcpy(pfres, pfres2, 9*sizeof(T)); - - return pfres; -} - -template -inline T* _multtrans4(T* pfres, const T* pf1, const T* pf2) -{ - T* pfres2; - - if (pfres == pf1) - pfres2 = (T*)alloca(16 * sizeof(T)); - else - pfres2 = pfres; - - for 
(int i = 0; i < 4; ++i) - { - for (int j = 0; j < 4; ++j) - { - pfres[4*i+j] = pf1[i] * pf2[j] + pf1[i+4] * pf2[j+4] + pf1[i+8] * pf2[j+8] + pf1[i+12] * pf2[j+12]; - } - } - - return pfres; -} - -inline dReal* multtrans3(dReal* pfres, const dReal* pf1, const dReal* pf2) { return _multtrans3(pfres, pf1, pf2); } -inline double* multtrans3(double* pfres, const double* pf1, const double* pf2) { return _multtrans3(pfres, pf1, pf2); } -inline dReal* multtrans4(dReal* pfres, const dReal* pf1, const dReal* pf2) { return _multtrans4(pfres, pf1, pf2); } -inline double* multtrans4(double* pfres, const double* pf1, const double* pf2) { return _multtrans4(pfres, pf1, pf2); } - -// stride is in T -template inline T* _inv3(const T* pf, T* pfres, int stride) -{ - T* pfres2; - - if (pfres == pf) - pfres2 = (T*)alloca(3 * stride * sizeof(T)); - else - pfres2 = pfres; - - // inverse = C^t / det(pf) where C is the matrix of coefficients - - // calc C^t - pfres2[0*stride + 0] = pf[1*stride + 1] * pf[2*stride + 2] - pf[1*stride + 2] * pf[2*stride + 1]; - pfres2[0*stride + 1] = pf[0*stride + 2] * pf[2*stride + 1] - pf[0*stride + 1] * pf[2*stride + 2]; - pfres2[0*stride + 2] = pf[0*stride + 1] * pf[1*stride + 2] - pf[0*stride + 2] * pf[1*stride + 1]; - - pfres2[1*stride + 0] = pf[1*stride + 2] * pf[2*stride + 0] - pf[1*stride + 0] * pf[2*stride + 2]; - pfres2[1*stride + 1] = pf[0*stride + 0] * pf[2*stride + 2] - pf[0*stride + 2] * pf[2*stride + 0]; - pfres2[1*stride + 2] = pf[0*stride + 2] * pf[1*stride + 0] - pf[0*stride + 0] * pf[1*stride + 2]; - - pfres2[2*stride + 0] = pf[1*stride + 0] * pf[2*stride + 1] - pf[1*stride + 1] * pf[2*stride + 0]; - pfres2[2*stride + 1] = pf[0*stride + 1] * pf[2*stride + 0] - pf[0*stride + 0] * pf[2*stride + 1]; - pfres2[2*stride + 2] = pf[0*stride + 0] * pf[1*stride + 1] - pf[0*stride + 1] * pf[1*stride + 0]; - - T fdet = pf[0*stride + 2] * pfres2[2*stride + 0] + pf[1*stride + 2] * pfres2[2*stride + 1] + - pf[2*stride + 2] * pfres2[2*stride + 2]; - - if (fabs(fdet) < 1e-6) return NULL; - - fdet = 1 / fdet; - - //if( pfdet != NULL ) *pfdet = fdet; - - if (pfres != pf) - { - pfres[0*stride+0] *= fdet; - pfres[0*stride+1] *= fdet; - pfres[0*stride+2] *= fdet; - pfres[1*stride+0] *= fdet; - pfres[1*stride+1] *= fdet; - pfres[1*stride+2] *= fdet; - pfres[2*stride+0] *= fdet; - pfres[2*stride+1] *= fdet; - pfres[2*stride+2] *= fdet; - return pfres; - } - - pfres[0*stride+0] = pfres2[0*stride+0] * fdet; - - pfres[0*stride+1] = pfres2[0*stride+1] * fdet; - pfres[0*stride+2] = pfres2[0*stride+2] * fdet; - pfres[1*stride+0] = pfres2[1*stride+0] * fdet; - pfres[1*stride+1] = pfres2[1*stride+1] * fdet; - pfres[1*stride+2] = pfres2[1*stride+2] * fdet; - pfres[2*stride+0] = pfres2[2*stride+0] * fdet; - pfres[2*stride+1] = pfres2[2*stride+1] * fdet; - pfres[2*stride+2] = pfres2[2*stride+2] * fdet; - return pfres; -} - -inline dReal* inv3(const dReal* pf, dReal* pfres, int stride) { return _inv3(pf, pfres, stride); } - -// inverse if 92 mults and 39 adds -template inline T* _inv4(const T* pf, T* pfres) -{ - T* pfres2; - - if (pfres == pf) - pfres2 = (T*)alloca(16 * sizeof(T)); - else - pfres2 = pfres; - - // inverse = C^t / det(pf) where C is the matrix of coefficients - - // calc C^t - - // determinants of all possibel 2x2 submatrices formed by last two rows - T fd0, fd1, fd2; - - T f1, f2, f3; - - fd0 = pf[2*4 + 0] * pf[3*4 + 1] - pf[2*4 + 1] * pf[3*4 + 0]; - fd1 = pf[2*4 + 1] * pf[3*4 + 2] - pf[2*4 + 2] * pf[3*4 + 1]; - fd2 = pf[2*4 + 2] * pf[3*4 + 3] - pf[2*4 + 3] * pf[3*4 + 2]; - - f1 = 
pf[2*4 + 1] * pf[3*4 + 3] - pf[2*4 + 3] * pf[3*4 + 1]; - f2 = pf[2*4 + 0] * pf[3*4 + 3] - pf[2*4 + 3] * pf[3*4 + 0]; - f3 = pf[2*4 + 0] * pf[3*4 + 2] - pf[2*4 + 2] * pf[3*4 + 0]; - - pfres2[0*4 + 0] = pf[1*4 + 1] * fd2 - pf[1*4 + 2] * f1 + pf[1*4 + 3] * fd1; - pfres2[0*4 + 1] = -(pf[0*4 + 1] * fd2 - pf[0*4 + 2] * f1 + pf[0*4 + 3] * fd1); - - pfres2[1*4 + 0] = -(pf[1*4 + 0] * fd2 - pf[1*4 + 2] * f2 + pf[1*4 + 3] * f3); - pfres2[1*4 + 1] = pf[0*4 + 0] * fd2 - pf[0*4 + 2] * f2 + pf[0*4 + 3] * f3; - - pfres2[2*4 + 0] = pf[1*4 + 0] * f1 - pf[1*4 + 1] * f2 + pf[1*4 + 3] * fd0; - pfres2[2*4 + 1] = -(pf[0*4 + 0] * f1 - pf[0*4 + 1] * f2 + pf[0*4 + 3] * fd0); - - pfres2[3*4 + 0] = -(pf[1*4 + 0] * fd1 - pf[1*4 + 1] * f3 + pf[1*4 + 2] * fd0); - pfres2[3*4 + 1] = pf[0*4 + 0] * fd1 - pf[0*4 + 1] * f3 + pf[0*4 + 2] * fd0; - - // determinants of first 2 rows of 4x4 matrix - fd0 = pf[0*4 + 0] * pf[1*4 + 1] - pf[0*4 + 1] * pf[1*4 + 0]; - fd1 = pf[0*4 + 1] * pf[1*4 + 2] - pf[0*4 + 2] * pf[1*4 + 1]; - fd2 = pf[0*4 + 2] * pf[1*4 + 3] - pf[0*4 + 3] * pf[1*4 + 2]; - - f1 = pf[0*4 + 1] * pf[1*4 + 3] - pf[0*4 + 3] * pf[1*4 + 1]; - f2 = pf[0*4 + 0] * pf[1*4 + 3] - pf[0*4 + 3] * pf[1*4 + 0]; - f3 = pf[0*4 + 0] * pf[1*4 + 2] - pf[0*4 + 2] * pf[1*4 + 0]; - - pfres2[0*4 + 2] = pf[3*4 + 1] * fd2 - pf[3*4 + 2] * f1 + pf[3*4 + 3] * fd1; - pfres2[0*4 + 3] = -(pf[2*4 + 1] * fd2 - pf[2*4 + 2] * f1 + pf[2*4 + 3] * fd1); - - pfres2[1*4 + 2] = -(pf[3*4 + 0] * fd2 - pf[3*4 + 2] * f2 + pf[3*4 + 3] * f3); - pfres2[1*4 + 3] = pf[2*4 + 0] * fd2 - pf[2*4 + 2] * f2 + pf[2*4 + 3] * f3; - - pfres2[2*4 + 2] = pf[3*4 + 0] * f1 - pf[3*4 + 1] * f2 + pf[3*4 + 3] * fd0; - pfres2[2*4 + 3] = -(pf[2*4 + 0] * f1 - pf[2*4 + 1] * f2 + pf[2*4 + 3] * fd0); - - pfres2[3*4 + 2] = -(pf[3*4 + 0] * fd1 - pf[3*4 + 1] * f3 + pf[3*4 + 2] * fd0); - pfres2[3*4 + 3] = pf[2*4 + 0] * fd1 - pf[2*4 + 1] * f3 + pf[2*4 + 2] * fd0; - - T fdet = pf[0*4 + 3] * pfres2[3*4 + 0] + pf[1*4 + 3] * pfres2[3*4 + 1] + - pf[2*4 + 3] * pfres2[3*4 + 2] + pf[3*4 + 3] * pfres2[3*4 + 3]; - - if (fabs(fdet) < 1e-6) return NULL; - - fdet = 1 / fdet; - - //if( pfdet != NULL ) *pfdet = fdet; - - if (pfres2 == pfres) - { - mult(pfres, fdet, 16); - return pfres; - } - - int i = 0; - - while (i < 16) - { - pfres[i] = pfres2[i] * fdet; - ++i; - } - - return pfres; -} - -inline dReal* inv4(const dReal* pf, dReal* pfres) { return _inv4(pf, pfres); } - -template inline T* _transpose3(const T* pf, T* pfres) -{ - assert(pf != NULL && pfres != NULL); - - if (pf == pfres) - { - rswap(pfres[1], pfres[3]); - rswap(pfres[2], pfres[6]); - rswap(pfres[5], pfres[7]); - return pfres; - } - - pfres[0] = pf[0]; - - pfres[1] = pf[3]; - pfres[2] = pf[6]; - pfres[3] = pf[1]; - pfres[4] = pf[4]; - pfres[5] = pf[7]; - pfres[6] = pf[2]; - pfres[7] = pf[5]; - pfres[8] = pf[8]; - - return pfres; -} - -inline dReal* transpose3(const dReal* pf, dReal* pfres) { return _transpose3(pf, pfres); } -inline double* transpose3(const double* pf, double* pfres) { return _transpose3(pf, pfres); } - -template inline T* _transpose4(const T* pf, T* pfres) -{ - assert(pf != NULL && pfres != NULL); - - if (pf == pfres) - { - rswap(pfres[1], pfres[4]); - rswap(pfres[2], pfres[8]); - rswap(pfres[3], pfres[12]); - rswap(pfres[6], pfres[9]); - rswap(pfres[7], pfres[13]); - rswap(pfres[11], pfres[15]); - return pfres; - } - - pfres[0] = pf[0]; - - pfres[1] = pf[4]; - pfres[2] = pf[8]; - pfres[3] = pf[12]; - pfres[4] = pf[1]; - pfres[5] = pf[5]; - pfres[6] = pf[9]; - pfres[7] = pf[13]; - pfres[8] = pf[2]; - pfres[9] = pf[6]; - pfres[10] = 
pf[10]; - pfres[11] = pf[14]; - pfres[12] = pf[3]; - pfres[13] = pf[7]; - pfres[14] = pf[11]; - pfres[15] = pf[15]; - return pfres; -} - -inline dReal* transpose4(const dReal* pf, dReal* pfres) { return _transpose4(pf, pfres); } -inline double* transpose4(const double* pf, double* pfres) { return _transpose4(pf, pfres); } - -inline dReal dot2(const dReal* pf1, const dReal* pf2) -{ - assert(pf1 != NULL && pf2 != NULL); - return pf1[0]*pf2[0] + pf1[1]*pf2[1]; -} - -inline dReal dot3(const dReal* pf1, const dReal* pf2) -{ - assert(pf1 != NULL && pf2 != NULL); - return pf1[0]*pf2[0] + pf1[1]*pf2[1] + pf1[2]*pf2[2]; -} - -inline dReal dot4(const dReal* pf1, const dReal* pf2) -{ - assert(pf1 != NULL && pf2 != NULL); - return pf1[0]*pf2[0] + pf1[1]*pf2[1] + pf1[2]*pf2[2] + pf1[3] * pf2[3]; -} - -inline dReal lengthsqr2(const dReal* pf) -{ - assert(pf != NULL); - return pf[0] * pf[0] + pf[1] * pf[1]; -} - -inline dReal lengthsqr3(const dReal* pf) -{ - assert(pf != NULL); - return pf[0] * pf[0] + pf[1] * pf[1] + pf[2] * pf[2]; -} - -inline dReal lengthsqr4(const dReal* pf) -{ - assert(pf != NULL); - return pf[0] * pf[0] + pf[1] * pf[1] + pf[2] * pf[2] + pf[3] * pf[3]; -} - -inline dReal* normalize2(dReal* pfout, const dReal* pf) -{ - assert(pf != NULL); - - dReal f = pf[0] * pf[0] + pf[1] * pf[1]; - f = 1.0f / sqrtf(f); - pfout[0] = pf[0] * f; - pfout[1] = pf[1] * f; - - return pfout; -} - -inline dReal* normalize3(dReal* pfout, const dReal* pf) -{ - assert(pf != NULL); - - dReal f = pf[0] * pf[0] + pf[1] * pf[1] + pf[2] * pf[2]; - - f = 1.0f / sqrtf(f); - pfout[0] = pf[0] * f; - pfout[1] = pf[1] * f; - pfout[2] = pf[2] * f; - - return pfout; -} - -inline dReal* normalize4(dReal* pfout, const dReal* pf) -{ - assert(pf != NULL); - - dReal f = pf[0] * pf[0] + pf[1] * pf[1] + pf[2] * pf[2] + pf[3] * pf[3]; - - f = 1.0f / sqrtf(f); - pfout[0] = pf[0] * f; - pfout[1] = pf[1] * f; - pfout[2] = pf[2] * f; - pfout[3] = pf[3] * f; - - return pfout; -} - -inline dReal* cross3(dReal* pfout, const dReal* pf1, const dReal* pf2) -{ - assert(pfout != NULL && pf1 != NULL && pf2 != NULL); - - dReal temp[3]; - temp[0] = pf1[1] * pf2[2] - pf1[2] * pf2[1]; - temp[1] = pf1[2] * pf2[0] - pf1[0] * pf2[2]; - temp[2] = pf1[0] * pf2[1] - pf1[1] * pf2[0]; - - pfout[0] = temp[0]; - pfout[1] = temp[1]; - pfout[2] = temp[2]; - return pfout; -} - -template inline void mult(T* pf, T fa, int r) -{ - assert(pf != NULL); - - while (r > 0) - { - --r; - pf[r] *= fa; - } -} - -template -inline T* mult(T* pf1, R* pf2, int r1, int c1, int c2, S* pfres, bool badd) -{ - assert(pf1 != NULL && pf2 != NULL && pfres != NULL); - int j, k; - - if (!badd) memset(pfres, 0, sizeof(S) * r1 * c2); - - while (r1 > 0) - { - --r1; - - j = 0; - - while (j < c2) - { - k = 0; - - while (k < c1) - { - pfres[j] += pf1[k] * pf2[k*c2 + j]; - ++k; - } - - ++j; - } - - pf1 += c1; - - pfres += c2; - } - - return pfres; -} - -template -inline T* multtrans(T* pf1, R* pf2, int r1, int c1, int c2, S* pfres, bool badd) -{ - assert(pf1 != NULL && pf2 != NULL && pfres != NULL); - int i, j, k; - - if (!badd) memset(pfres, 0, sizeof(S) * c1 * c2); - - i = 0; - - while (i < c1) - { - j = 0; - - while (j < c2) - { - k = 0; - - while (k < r1) - { - pfres[j] += pf1[k*c1] * pf2[k*c2 + j]; - ++k; - } - - ++j; - } - - pfres += c2; - - ++pf1; - - ++i; - } - - return pfres; -} - -template -inline T* multtrans_to2(T* pf1, R* pf2, int r1, int c1, int r2, S* pfres, bool badd) -{ - assert(pf1 != NULL && pf2 != NULL && pfres != NULL); - int j, k; - - if (!badd) memset(pfres, 0, 
sizeof(S) * r1 * r2); - - while (r1 > 0) - { - --r1; - - j = 0; - - while (j < r2) - { - k = 0; - - while (k < c1) - { - pfres[j] += pf1[k] * pf2[j*c1 + k]; - ++k; - } - - ++j; - } - - pf1 += c1; - - pfres += r2; - } - - return pfres; -} - -template inline T* multto1(T* pf1, T* pf2, int r, int c, T* pftemp) -{ - assert(pf1 != NULL && pf2 != NULL); - - int j, k; - bool bdel = false; - - if (pftemp == NULL) - { - pftemp = new T[c]; - bdel = true; - } - - while (r > 0) - { - --r; - - j = 0; - - while (j < c) - { - - pftemp[j] = 0.0; - - k = 0; - - while (k < c) - { - pftemp[j] += pf1[k] * pf2[k*c + j]; - ++k; - } - - ++j; - } - - memcpy(pf1, pftemp, c * sizeof(T)); - - pf1 += c; - } - - if (bdel) delete[] pftemp; - - return pf1; -} - -template inline T* multto2(T* pf1, S* pf2, int r2, int c2, S* pftemp) -{ - assert(pf1 != NULL && pf2 != NULL); - - int i, j, k; - bool bdel = false; - - if (pftemp == NULL) - { - pftemp = new S[r2]; - bdel = true; - } - - // do columns first - j = 0; - - while (j < c2) - { - i = 0; - - while (i < r2) - { - - pftemp[i] = 0.0; - - k = 0; - - while (k < r2) - { - pftemp[i] += pf1[i*r2 + k] * pf2[k*c2 + j]; - ++k; - } - - ++i; - } - - i = 0; - - while (i < r2) - { - *(pf2 + i*c2 + j) = pftemp[i]; - ++i; - } - - ++j; - } - - if (bdel) delete[] pftemp; - - return pf1; -} - -template inline void add(T* pf1, T* pf2, int r) -{ - assert(pf1 != NULL && pf2 != NULL); - - while (r > 0) - { - --r; - pf1[r] += pf2[r]; - } -} - -template inline void sub(T* pf1, T* pf2, int r) -{ - assert(pf1 != NULL && pf2 != NULL); - - while (r > 0) - { - --r; - pf1[r] -= pf2[r]; - } -} - -template inline T normsqr(T* pf1, int r) -{ - assert(pf1 != NULL); - - T d = 0.0; - - while (r > 0) - { - --r; - d += pf1[r] * pf1[r]; - } - - return d; -} - -template inline T lengthsqr(T* pf1, T* pf2, int length) -{ - T d = 0; - - while (length > 0) - { - --length; - d += sqr(pf1[length] - pf2[length]); - } - - return d; -} - -template inline T dot(T* pf1, T* pf2, int length) -{ - T d = 0; - - while (length > 0) - { - --length; - d += pf1[length] * pf2[length]; - } - - return d; -} - -template inline T sum(T* pf, int length) -{ - T d = 0; - - while (length > 0) - { - --length; - d += pf[length]; - } - - return d; -} - -template inline bool inv2(T* pf, T* pfres) -{ - T fdet = pf[0] * pf[3] - pf[1] * pf[2]; - - if (fabs(fdet) < 1e-16) return false; - - fdet = 1 / fdet; - - //if( pfdet != NULL ) *pfdet = fdet; - - if (pfres != pf) - { - pfres[0] = fdet * pf[3]; - pfres[1] = -fdet * pf[1]; - pfres[2] = -fdet * pf[2]; - pfres[3] = fdet * pf[0]; - return true; - } - - dReal ftemp = pf[0]; - - pfres[0] = pf[3] * fdet; - pfres[1] *= -fdet; - pfres[2] *= -fdet; - pfres[3] = ftemp * pf[0]; - - return true; -} - #endif
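
Note on the write-mask pattern: every FrameSwizzleBlock*/Frame16SwizzleBlock* variant preserved under #if 0 above open-codes the same read-modify-write blend against WriteMask. A minimal standalone sketch of that blend, kept here purely for reference (the helper name and the uint32_t spelling are illustrative, not part of the plugin, which uses its own u32 typedef):

    #include <cstdint>

    // Bits set in writeMask come from the freshly swizzled value; all other
    // bits keep whatever is already stored in the destination word.
    static inline uint32_t MaskedWrite(uint32_t dst, uint32_t src, uint32_t writeMask)
    {
        return (src & writeMask) | (dst & ~writeMask);
    }

With WriteMask == 0xffffffff the blend degenerates to a plain store, which is why each of those functions special-cases that value before entering its loop.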
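Since Vector::normalize() and the normalize4()/cross3() prototypes are dropped from zerogsmath.h, any remaining caller would have to normalize through the members the header still provides (dot() and the float * Vector operator). A hedged sketch, assuming zerogsmath.h is included and the vector is non-zero; Normalized is a hypothetical helper, not something this patch adds:

    #include <cmath>

    inline Vector Normalized(const Vector& v)
    {
        // dot() sums x*x + y*y + z*z + w*w, matching the scaling the old
        // normalize4() applied before it was removed.
        return (1.0f / sqrtf(v.dot(v))) * v;
    }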