2010-04-25 00:31:27 +00:00
/*
2009-02-09 21:15:56 +00:00
* Copyright ( C ) 2007 - 2009 Gabest
* http : //www.gabest.org
*
* This Program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 , or ( at your option )
* any later version .
2010-04-25 00:31:27 +00:00
*
2009-02-09 21:15:56 +00:00
* This Program is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
2010-04-25 00:31:27 +00:00
*
2009-02-09 21:15:56 +00:00
* You should have received a copy of the GNU General Public License
* along with GNU Make ; see the file COPYING . If not , write to
2012-09-09 18:16:11 +00:00
* the Free Software Foundation , Inc . , 51 Franklin Street , Fifth Floor , Boston , MA 02110 - 1301 , USA USA .
2009-02-09 21:15:56 +00:00
* http : //www.gnu.org/copyleft/gpl.html
*
*/
# pragma once
# include "GS.h"
# include "GSTables.h"
# include "GSVector.h"
class GSBlock
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
2016-07-06 23:21:31 +00:00
static GSVector8i m_r16mask ;
2013-06-10 15:56:11 +00:00
# else
2016-07-06 23:21:31 +00:00
static GSVector4i m_r16mask ;
2013-06-10 15:56:11 +00:00
# endif
2016-07-06 23:21:31 +00:00
static GSVector4i m_r8mask ;
static GSVector4i m_r4mask ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
2016-07-06 23:21:31 +00:00
static GSVector8i m_xxxa ;
static GSVector8i m_xxbx ;
static GSVector8i m_xgxx ;
static GSVector8i m_rxxx ;
2013-06-10 15:56:11 +00:00
# else
2016-07-06 23:21:31 +00:00
static GSVector4i m_xxxa ;
static GSVector4i m_xxbx ;
static GSVector4i m_xgxx ;
static GSVector4i m_rxxx ;
2013-06-10 15:56:11 +00:00
# endif
2009-02-09 21:15:56 +00:00
2016-07-06 23:21:31 +00:00
static GSVector4i m_uw8hmask0 ;
static GSVector4i m_uw8hmask1 ;
static GSVector4i m_uw8hmask2 ;
static GSVector4i m_uw8hmask3 ;
2009-02-09 21:15:56 +00:00
public :
2016-07-06 23:21:31 +00:00
static void InitVectors ( ) ;
2013-06-17 04:11:10 +00:00
template < int i , int alignment , uint32 mask > __forceinline static void WriteColumn32 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
const uint8 * RESTRICT s0 = & src [ srcpitch * 0 ] ;
const uint8 * RESTRICT s1 = & src [ srcpitch * 1 ] ;
# if _M_SSE >= 0x501
GSVector8i v0 , v1 ;
if ( alignment = = 32 )
{
v0 = GSVector8i : : load < true > ( s0 ) . acbd ( ) ;
v1 = GSVector8i : : load < true > ( s1 ) . acbd ( ) ;
GSVector8i : : sw64 ( v0 , v1 ) ;
}
else
{
if ( alignment = = 16 )
{
v0 = GSVector8i : : load ( & s0 [ 0 ] , & s0 [ 16 ] ) . acbd ( ) ;
v1 = GSVector8i : : load ( & s1 [ 0 ] , & s1 [ 16 ] ) . acbd ( ) ;
GSVector8i : : sw64 ( v0 , v1 ) ;
}
else
{
//v0 = GSVector8i::load(&s0[0], &s0[16], &s0[8], &s0[24]);
//v1 = GSVector8i::load(&s1[0], &s1[16], &s1[8], &s1[24]);
GSVector4i v4 = GSVector4i : : load ( & s0 [ 0 ] , & s1 [ 0 ] ) ;
GSVector4i v5 = GSVector4i : : load ( & s0 [ 8 ] , & s1 [ 8 ] ) ;
GSVector4i v6 = GSVector4i : : load ( & s0 [ 16 ] , & s1 [ 16 ] ) ;
GSVector4i v7 = GSVector4i : : load ( & s0 [ 24 ] , & s1 [ 24 ] ) ;
if ( mask = = 0xffffffff )
{
// just write them out directly
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = v4 ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = v5 ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = v6 ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = v7 ;
return ;
}
v0 = GSVector8i : : cast ( v4 ) . insert < 1 > ( v5 ) ;
v1 = GSVector8i : : cast ( v6 ) . insert < 1 > ( v7 ) ;
}
}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
if ( mask = = 0xffffffff )
{
( ( GSVector8i * ) dst ) [ i * 2 + 0 ] = v0 ;
( ( GSVector8i * ) dst ) [ i * 2 + 1 ] = v1 ;
}
else
2012-01-08 17:10:00 +00:00
{
2013-06-17 04:11:10 +00:00
GSVector8i v2 ( ( int ) mask ) ;
if ( mask = = 0xff000000 | | mask = = 0x00ffffff )
{
( ( GSVector8i * ) dst ) [ i * 2 + 0 ] = ( ( GSVector8i * ) dst ) [ i * 2 + 0 ] . blend8 ( v0 , v2 ) ;
( ( GSVector8i * ) dst ) [ i * 2 + 1 ] = ( ( GSVector8i * ) dst ) [ i * 2 + 1 ] . blend8 ( v1 , v2 ) ;
}
else
{
( ( GSVector8i * ) dst ) [ i * 2 + 0 ] = ( ( GSVector8i * ) dst ) [ i * 2 + 0 ] . blend ( v0 , v2 ) ;
( ( GSVector8i * ) dst ) [ i * 2 + 1 ] = ( ( GSVector8i * ) dst ) [ i * 2 + 1 ] . blend ( v1 , v2 ) ;
}
}
# else
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
if ( alignment ! = 0 )
{
v0 = GSVector4i : : load < true > ( & s0 [ 0 ] ) ;
v1 = GSVector4i : : load < true > ( & s0 [ 16 ] ) ;
v2 = GSVector4i : : load < true > ( & s1 [ 0 ] ) ;
v3 = GSVector4i : : load < true > ( & s1 [ 16 ] ) ;
2012-01-08 17:10:00 +00:00
GSVector4i : : sw64 ( v0 , v2 , v1 , v3 ) ;
}
else
{
v0 = GSVector4i : : load ( & s0 [ 0 ] , & s1 [ 0 ] ) ;
v1 = GSVector4i : : load ( & s0 [ 8 ] , & s1 [ 8 ] ) ;
v2 = GSVector4i : : load ( & s0 [ 16 ] , & s1 [ 16 ] ) ;
v3 = GSVector4i : : load ( & s0 [ 24 ] , & s1 [ 24 ] ) ;
}
2009-02-09 21:15:56 +00:00
if ( mask = = 0xffffffff )
{
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = v0 ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = v1 ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = v2 ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = v3 ;
}
else
{
GSVector4i v4 ( ( int ) mask ) ;
# if _M_SSE >= 0x401
if ( mask = = 0xff000000 | | mask = = 0x00ffffff )
{
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 0 ] . blend8 ( v0 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 1 ] . blend8 ( v1 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 2 ] . blend8 ( v2 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 3 ] . blend8 ( v3 , v4 ) ;
}
else
{
# endif
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 0 ] . blend ( v0 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 1 ] . blend ( v1 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 2 ] . blend ( v2 , v4 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 3 ] . blend ( v3 , v4 ) ;
# if _M_SSE >= 0x401
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
}
# endif
}
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int i , int alignment > __forceinline static void WriteColumn16 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
const uint8 * RESTRICT s0 = & src [ srcpitch * 0 ] ;
const uint8 * RESTRICT s1 = & src [ srcpitch * 1 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
// for(int j = 0; j < 16; j++) {((uint16*)s0)[j] = columnTable16[0][j]; ((uint16*)s1)[j] = columnTable16[1][j];}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
GSVector8i v0 , v1 ;
if ( alignment = = 32 )
{
v0 = GSVector8i : : load < true > ( s0 ) ;
v1 = GSVector8i : : load < true > ( s1 ) ;
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw16 ( v0 , v1 ) ;
}
else
{
if ( alignment = = 16 )
{
v0 = GSVector8i : : load ( & s0 [ 0 ] , & s1 [ 0 ] ) ;
v1 = GSVector8i : : load ( & s0 [ 16 ] , & s1 [ 16 ] ) ;
}
else
{
v0 = GSVector8i : : load ( & s0 [ 0 ] , & s0 [ 8 ] , & s1 [ 0 ] , & s1 [ 8 ] ) ;
v1 = GSVector8i : : load ( & s0 [ 16 ] , & s0 [ 24 ] , & s1 [ 16 ] , & s1 [ 24 ] ) ;
}
GSVector8i : : sw16 ( v0 , v1 ) ;
}
v0 = v0 . acbd ( ) ;
v1 = v1 . acbd ( ) ;
( ( GSVector8i * ) dst ) [ i * 2 + 0 ] = v0 ;
( ( GSVector8i * ) dst ) [ i * 2 + 1 ] = v1 ;
# else
GSVector4i v0 , v1 , v2 , v3 ;
if ( alignment ! = 0 )
{
v0 = GSVector4i : : load < true > ( & s0 [ 0 ] ) ;
v1 = GSVector4i : : load < true > ( & s0 [ 16 ] ) ;
v2 = GSVector4i : : load < true > ( & s1 [ 0 ] ) ;
v3 = GSVector4i : : load < true > ( & s1 [ 16 ] ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
}
else
{
v0 = GSVector4i : : loadl ( & s0 [ 0 ] ) . upl16 ( GSVector4i : : loadl ( & s0 [ 16 ] ) ) ;
v2 = GSVector4i : : loadl ( & s0 [ 8 ] ) . upl16 ( GSVector4i : : loadl ( & s0 [ 24 ] ) ) ;
v1 = GSVector4i : : loadl ( & s1 [ 0 ] ) . upl16 ( GSVector4i : : loadl ( & s1 [ 16 ] ) ) ;
v3 = GSVector4i : : loadl ( & s1 [ 8 ] ) . upl16 ( GSVector4i : : loadl ( & s1 [ 24 ] ) ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
}
2009-02-09 21:15:56 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = v0 ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = v2 ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = v1 ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = v3 ;
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int i , int alignment > __forceinline static void WriteColumn8 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
// TODO: read unaligned as WriteColumn32 does and try saving a few shuffles
# if _M_SSE >= 0x501
GSVector4i v4 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 0 ] ) ;
GSVector4i v5 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 1 ] ) ;
GSVector4i v6 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 2 ] ) ;
GSVector4i v7 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 3 ] ) ;
GSVector8i v0 ( v4 , v5 ) ;
GSVector8i v1 ( v6 , v7 ) ;
if ( ( i & 1 ) = = 0 )
{
v1 = v1 . yxwz ( ) ;
}
else
{
v0 = v0 . yxwz ( ) ;
}
GSVector8i : : sw8 ( v0 , v1 ) ;
GSVector8i : : sw16 ( v0 , v1 ) ;
v0 = v0 . acbd ( ) ;
v1 = v1 . acbd ( ) ;
( ( GSVector8i * ) dst ) [ i * 2 + 0 ] = v0 ;
( ( GSVector8i * ) dst ) [ i * 2 + 1 ] = v1 ;
# else
GSVector4i v0 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 0 ] ) ;
GSVector4i v1 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 1 ] ) ;
GSVector4i v2 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 2 ] ) ;
GSVector4i v3 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
if ( ( i & 1 ) = = 0 )
{
v2 = v2 . yxwz ( ) ;
v3 = v3 . yxwz ( ) ;
}
else
{
v0 = v0 . yxwz ( ) ;
v1 = v1 . yxwz ( ) ;
}
GSVector4i : : sw8 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = v0 ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = v2 ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = v1 ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = v3 ;
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int i , int alignment > __forceinline static void WriteColumn4 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("WriteColumn4\n");
// TODO: read unaligned as WriteColumn32 does and try saving a few shuffles
2009-02-09 21:15:56 +00:00
// TODO: pshufb
2013-06-17 04:11:10 +00:00
GSVector4i v0 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 0 ] ) ;
GSVector4i v1 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 1 ] ) ;
GSVector4i v2 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 2 ] ) ;
GSVector4i v3 = GSVector4i : : load < alignment ! = 0 > ( & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
if ( ( i & 1 ) = = 0 )
{
v2 = v2 . yxwzlh ( ) ;
v3 = v3 . yxwzlh ( ) ;
}
else
{
v0 = v0 . yxwzlh ( ) ;
v1 = v1 . yxwzlh ( ) ;
}
GSVector4i : : sw4 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw8 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw64 ( v0 , v2 , v1 , v3 ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = v0 ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = v1 ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = v2 ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = v3 ;
}
2013-06-17 04:11:10 +00:00
template < int alignment , uint32 mask > static void WriteColumn32 ( int y , uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 1 ) & 3 )
{
2013-06-17 04:11:10 +00:00
case 0 : WriteColumn32 < 0 , alignment , mask > ( dst , src , srcpitch ) ; break ;
case 1 : WriteColumn32 < 1 , alignment , mask > ( dst , src , srcpitch ) ; break ;
case 2 : WriteColumn32 < 2 , alignment , mask > ( dst , src , srcpitch ) ; break ;
case 3 : WriteColumn32 < 3 , alignment , mask > ( dst , src , srcpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteColumn16 ( int y , uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 1 ) & 3 )
{
2013-06-17 04:11:10 +00:00
case 0 : WriteColumn16 < 0 , alignment > ( dst , src , srcpitch ) ; break ;
case 1 : WriteColumn16 < 1 , alignment > ( dst , src , srcpitch ) ; break ;
case 2 : WriteColumn16 < 2 , alignment > ( dst , src , srcpitch ) ; break ;
case 3 : WriteColumn16 < 3 , alignment > ( dst , src , srcpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteColumn8 ( int y , uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 2 ) & 3 )
{
2013-06-17 04:11:10 +00:00
case 0 : WriteColumn8 < 0 , alignment > ( dst , src , srcpitch ) ; break ;
case 1 : WriteColumn8 < 1 , alignment > ( dst , src , srcpitch ) ; break ;
case 2 : WriteColumn8 < 2 , alignment > ( dst , src , srcpitch ) ; break ;
case 3 : WriteColumn8 < 3 , alignment > ( dst , src , srcpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteColumn4 ( int y , uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 2 ) & 3 )
{
2013-06-17 04:11:10 +00:00
case 0 : WriteColumn4 < 0 , alignment > ( dst , src , srcpitch ) ; break ;
case 1 : WriteColumn4 < 1 , alignment > ( dst , src , srcpitch ) ; break ;
case 2 : WriteColumn4 < 2 , alignment > ( dst , src , srcpitch ) ; break ;
case 3 : WriteColumn4 < 3 , alignment > ( dst , src , srcpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-17 04:11:10 +00:00
template < int alignment , uint32 mask > static void WriteBlock32 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
WriteColumn32 < 0 , alignment , mask > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn32 < 1 , alignment , mask > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn32 < 2 , alignment , mask > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn32 < 3 , alignment , mask > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteBlock16 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
WriteColumn16 < 0 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn16 < 1 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn16 < 2 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 2 ;
2013-06-17 04:11:10 +00:00
WriteColumn16 < 3 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteBlock8 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
WriteColumn8 < 0 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn8 < 1 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn8 < 2 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn8 < 3 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
template < int alignment > static void WriteBlock4 ( uint8 * RESTRICT dst , const uint8 * RESTRICT src , int srcpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
WriteColumn4 < 0 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn4 < 1 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn4 < 2 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
src + = srcpitch * 4 ;
2013-06-17 04:11:10 +00:00
WriteColumn4 < 3 , alignment > ( dst , src , srcpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
template < int i > __forceinline static void ReadColumn32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
const GSVector8i * s = ( const GSVector8i * ) src ;
2012-01-08 17:10:00 +00:00
2013-06-10 15:56:11 +00:00
GSVector8i v0 = s [ i * 2 + 0 ] ;
GSVector8i v1 = s [ i * 2 + 1 ] ;
2012-01-08 17:10:00 +00:00
2013-06-10 15:56:11 +00:00
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw64 ( v0 , v1 ) ;
GSVector8i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector8i : : store < true > ( & dst [ dstpitch * 1 ] , v1 ) ;
# else
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
GSVector4i v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
2009-02-09 21:15:56 +00:00
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & d0 [ 0 ] , v0 ) ;
GSVector4i : : store < true > ( & d0 [ 1 ] , v1 ) ;
GSVector4i : : store < true > ( & d1 [ 0 ] , v2 ) ;
GSVector4i : : store < true > ( & d1 [ 1 ] , v3 ) ;
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
template < int i > __forceinline static void ReadColumn16 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
2013-07-01 21:28:58 +00:00
2013-06-10 15:56:11 +00:00
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i v0 = s [ i * 2 + 0 ] . shuffle8 ( m_r16mask ) ;
GSVector8i v1 = s [ i * 2 + 1 ] . shuffle8 ( m_r16mask ) ;
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw32 ( v0 , v1 ) ;
v0 = v0 . acbd ( ) ;
v1 = v1 . acbd ( ) ;
GSVector8i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector8i : : store < true > ( & dst [ dstpitch * 1 ] , v1 ) ;
# elif _M_SSE >= 0x301
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 = s [ i * 4 + 0 ] . shuffle8 ( m_r16mask ) ;
GSVector4i v1 = s [ i * 4 + 1 ] . shuffle8 ( m_r16mask ) ;
GSVector4i v2 = s [ i * 4 + 2 ] . shuffle8 ( m_r16mask ) ;
GSVector4i v3 = s [ i * 4 + 3 ] . shuffle8 ( m_r16mask ) ;
GSVector4i : : sw32 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & d0 [ 0 ] , v0 ) ;
GSVector4i : : store < true > ( & d0 [ 1 ] , v2 ) ;
GSVector4i : : store < true > ( & d1 [ 0 ] , v1 ) ;
GSVector4i : : store < true > ( & d1 [ 1 ] , v3 ) ;
2009-02-09 21:15:56 +00:00
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
2010-04-25 00:31:27 +00:00
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
GSVector4i v3 = s [ i * 4 + 3 ] ;
2013-06-10 15:56:11 +00:00
//for(int16 i = 0; i < 8; i++) {v0.i16[i] = i; v1.i16[i] = i + 8; v2.i16[i] = i + 16; v3.i16[i] = i + 24;}
2009-02-09 21:15:56 +00:00
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v2 , v1 , v3 ) ;
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & d0 [ 0 ] , v0 ) ;
GSVector4i : : store < true > ( & d0 [ 1 ] , v1 ) ;
GSVector4i : : store < true > ( & d1 [ 0 ] , v2 ) ;
GSVector4i : : store < true > ( & d1 [ 1 ] , v3 ) ;
2009-02-09 21:15:56 +00:00
# endif
}
2013-06-10 15:56:11 +00:00
template < int i > __forceinline static void ReadColumn8 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//for(int j = 0; j < 64; j++) ((uint8*)src)[j] = (uint8)j;
#if 0 //_M_SSE >= 0x501
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i v0 = s [ i * 2 + 0 ] ;
GSVector8i v1 = s [ i * 2 + 1 ] ;
GSVector8i : : sw8 ( v0 , v1 ) ;
GSVector8i : : sw16 ( v0 , v1 ) ;
GSVector8i : : sw8 ( v0 , v1 ) ;
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw16 ( v0 , v1 ) ;
v0 = v0 . acbd ( ) ;
v1 = v1 . acbd ( ) ;
v1 = v1 . yxwz ( ) ;
GSVector8i : : storel ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector8i : : storeh ( & dst [ dstpitch * 1 ] , v0 ) ;
GSVector8i : : storel ( & dst [ dstpitch * 2 ] , v1 ) ;
GSVector8i : : storeh ( & dst [ dstpitch * 3 ] , v1 ) ;
// TODO: not sure if this is worth it, not in this form, there should be a shorter path
# elif _M_SSE >= 0x301
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
if ( ( i & 1 ) = = 0 )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
}
else
{
2010-04-25 00:31:27 +00:00
v2 = s [ i * 4 + 0 ] ;
v3 = s [ i * 4 + 1 ] ;
v0 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v1 = s [ i * 4 + 3 ] ;
}
v0 = v0 . shuffle8 ( m_r8mask ) ;
v1 = v1 . shuffle8 ( m_r8mask ) ;
v2 = v2 . shuffle8 ( m_r8mask ) ;
v3 = v3 . shuffle8 ( m_r8mask ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v3 , v2 ) ;
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 ] , v3 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 2 ] , v1 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 3 ] , v2 ) ;
2009-02-09 21:15:56 +00:00
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
2010-04-25 00:31:27 +00:00
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
GSVector4i v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw8 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
if ( ( i & 1 ) = = 0 )
{
v2 = v2 . yxwz ( ) ;
v3 = v3 . yxwz ( ) ;
}
else
{
v0 = v0 . yxwz ( ) ;
v1 = v1 . yxwz ( ) ;
2010-04-25 00:31:27 +00:00
}
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 ] , v1 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 2 ] , v2 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 3 ] , v3 ) ;
2009-02-09 21:15:56 +00:00
# endif
}
2010-04-25 00:31:27 +00:00
2013-06-10 15:56:11 +00:00
template < int i > __forceinline static void ReadColumn4 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadColumn4\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x301
const GSVector4i * s = ( const GSVector4i * ) src ;
2010-04-25 00:31:27 +00:00
GSVector4i v0 = s [ i * 4 + 0 ] . xzyw ( ) ;
GSVector4i v1 = s [ i * 4 + 1 ] . xzyw ( ) ;
GSVector4i v2 = s [ i * 4 + 2 ] . xzyw ( ) ;
2009-02-09 21:15:56 +00:00
GSVector4i v3 = s [ i * 4 + 3 ] . xzyw ( ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw4 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
v0 = v0 . shuffle8 ( m_r4mask ) ;
v1 = v1 . shuffle8 ( m_r4mask ) ;
v2 = v2 . shuffle8 ( m_r4mask ) ;
v3 = v3 . shuffle8 ( m_r4mask ) ;
if ( ( i & 1 ) = = 0 )
{
GSVector4i : : sw16rh ( v0 , v1 , v2 , v3 ) ;
}
else
{
GSVector4i : : sw16rl ( v0 , v1 , v2 , v3 ) ;
}
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 ] , v1 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 2 ] , v2 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 3 ] , v3 ) ;
2009-02-09 21:15:56 +00:00
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
2010-04-25 00:31:27 +00:00
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
GSVector4i v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw32 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw4 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v2 , v1 , v3 ) ;
v0 = v0 . xzyw ( ) ;
v1 = v1 . xzyw ( ) ;
v2 = v2 . xzyw ( ) ;
v3 = v3 . xzyw ( ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
if ( ( i & 1 ) = = 0 )
{
v2 = v2 . yxwzlh ( ) ;
v3 = v3 . yxwzlh ( ) ;
}
else
{
v0 = v0 . yxwzlh ( ) ;
v1 = v1 . yxwzlh ( ) ;
}
2013-06-10 15:56:11 +00:00
GSVector4i : : store < true > ( & dst [ dstpitch * 0 ] , v0 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 ] , v1 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 2 ] , v2 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 3 ] , v3 ) ;
2009-02-09 21:15:56 +00:00
# endif
}
2013-06-10 15:56:11 +00:00
static void ReadColumn32 ( int y , const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 1 ) & 3 )
{
2013-06-10 15:56:11 +00:00
case 0 : ReadColumn32 < 0 > ( src , dst , dstpitch ) ; break ;
case 1 : ReadColumn32 < 1 > ( src , dst , dstpitch ) ; break ;
case 2 : ReadColumn32 < 2 > ( src , dst , dstpitch ) ; break ;
case 3 : ReadColumn32 < 3 > ( src , dst , dstpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-10 15:56:11 +00:00
static void ReadColumn16 ( int y , const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 1 ) & 3 )
{
2013-06-10 15:56:11 +00:00
case 0 : ReadColumn16 < 0 > ( src , dst , dstpitch ) ; break ;
case 1 : ReadColumn16 < 1 > ( src , dst , dstpitch ) ; break ;
case 2 : ReadColumn16 < 2 > ( src , dst , dstpitch ) ; break ;
case 3 : ReadColumn16 < 3 > ( src , dst , dstpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-10 15:56:11 +00:00
static void ReadColumn8 ( int y , const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 2 ) & 3 )
{
2013-06-10 15:56:11 +00:00
case 0 : ReadColumn8 < 0 > ( src , dst , dstpitch ) ; break ;
case 1 : ReadColumn8 < 1 > ( src , dst , dstpitch ) ; break ;
case 2 : ReadColumn8 < 2 > ( src , dst , dstpitch ) ; break ;
case 3 : ReadColumn8 < 3 > ( src , dst , dstpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-10 15:56:11 +00:00
static void ReadColumn4 ( int y , const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
switch ( ( y > > 2 ) & 3 )
{
2013-06-10 15:56:11 +00:00
case 0 : ReadColumn4 < 0 > ( src , dst , dstpitch ) ; break ;
case 1 : ReadColumn4 < 1 > ( src , dst , dstpitch ) ; break ;
case 2 : ReadColumn4 < 2 > ( src , dst , dstpitch ) ; break ;
case 3 : ReadColumn4 < 3 > ( src , dst , dstpitch ) ; break ;
2009-02-09 21:15:56 +00:00
default : __assume ( 0 ) ;
}
}
2013-06-10 15:56:11 +00:00
static void ReadBlock32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
ReadColumn32 < 0 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn32 < 1 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn32 < 2 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn32 < 3 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
static void ReadBlock16 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
ReadColumn16 < 0 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn16 < 1 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn16 < 2 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 2 ;
2013-06-10 15:56:11 +00:00
ReadColumn16 < 3 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
static void ReadBlock8 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
ReadColumn8 < 0 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn8 < 1 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn8 < 2 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn8 < 3 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
static void ReadBlock4 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
ReadColumn4 < 0 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn4 < 1 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn4 < 2 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
dst + = dstpitch * 4 ;
2013-06-10 15:56:11 +00:00
ReadColumn4 < 3 > ( src , dst , dstpitch ) ;
2009-02-09 21:15:56 +00:00
}
2009-05-14 16:41:52 +00:00
__forceinline static void ReadBlock4P ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadBlock4P\n");
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask ( 0x0f0f0f0f ) ;
for ( int i = 0 ; i < 2 ; i + + )
{
// col 0, 2
2010-04-25 00:31:27 +00:00
v0 = s [ i * 8 + 0 ] ;
v1 = s [ i * 8 + 1 ] ;
v2 = s [ i * 8 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 8 + 3 ] ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw8 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 0 ] , ( v0 & mask ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 16 ] , ( v1 & mask ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 0 ] , ( v2 & mask ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 16 ] , ( v3 & mask ) ) ;
dst + = dstpitch * 2 ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 0 ] , ( v0 . andnot ( mask ) ) . yxwz ( ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 16 ] , ( v1 . andnot ( mask ) ) . yxwz ( ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 0 ] , ( v2 . andnot ( mask ) ) . yxwz ( ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 16 ] , ( v3 . andnot ( mask ) ) . yxwz ( ) > > 4 ) ;
dst + = dstpitch * 2 ;
// col 1, 3
2010-04-25 00:31:27 +00:00
v0 = s [ i * 8 + 4 ] ;
v1 = s [ i * 8 + 5 ] ;
v2 = s [ i * 8 + 6 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 8 + 7 ] ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw8 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 0 ] , ( v0 & mask ) . yxwz ( ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 16 ] , ( v1 & mask ) . yxwz ( ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 0 ] , ( v2 & mask ) . yxwz ( ) ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 16 ] , ( v3 & mask ) . yxwz ( ) ) ;
dst + = dstpitch * 2 ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 0 ] , ( v0 . andnot ( mask ) ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 0 + 16 ] , ( v1 . andnot ( mask ) ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 0 ] , ( v2 . andnot ( mask ) ) > > 4 ) ;
GSVector4i : : store < true > ( & dst [ dstpitch * 1 + 16 ] , ( v3 . andnot ( mask ) ) > > 4 ) ;
dst + = dstpitch * 2 ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ReadBlock8HP ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
uint8 * RESTRICT d0 = & dst [ dstpitch * 0 ] ;
uint8 * RESTRICT d1 = & dst [ dstpitch * 4 ] ;
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i v0 , v1 , v2 , v3 ;
GSVector4i v4 , v5 ;
v0 = s [ 0 ] . acbd ( ) ;
v1 = s [ 1 ] . acbd ( ) ;
v2 = s [ 2 ] . acbd ( ) ;
v3 = s [ 3 ] . acbd ( ) ;
v0 = ( v0 > > 24 ) . ps32 ( v1 > > 24 ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) ;
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
GSVector4i : : storel ( & d0 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d0 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 3 ] , v5 ) ;
v0 = s [ 4 ] . acbd ( ) ;
v1 = s [ 5 ] . acbd ( ) ;
v2 = s [ 6 ] . acbd ( ) ;
v3 = s [ 7 ] . acbd ( ) ;
v0 = ( v0 > > 24 ) . ps32 ( v1 > > 24 ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) ;
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
GSVector4i : : storel ( & d1 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d1 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 3 ] , v5 ) ;
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
for ( int i = 0 ; i < 4 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
v0 = ( ( v0 > > 24 ) . ps32 ( v1 > > 24 ) ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) ;
GSVector4i : : storel ( dst , v0 ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
GSVector4i : : storeh ( dst , v0 ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
}
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2009-05-14 16:41:52 +00:00
__forceinline static void ReadBlock4HLP ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
uint8 * RESTRICT d0 = & dst [ dstpitch * 0 ] ;
uint8 * RESTRICT d1 = & dst [ dstpitch * 4 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i v0 , v1 , v2 , v3 ;
GSVector4i v4 , v5 ;
GSVector8i mask ( 0x0f0f0f0f ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = s [ 0 ] . acbd ( ) ;
v1 = s [ 1 ] . acbd ( ) ;
v2 = s [ 2 ] . acbd ( ) ;
v3 = s [ 3 ] . acbd ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = ( v0 > > 24 ) . ps32 ( v1 > > 24 ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) & mask ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : storel ( & d0 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d0 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 3 ] , v5 ) ;
2010-04-25 00:31:27 +00:00
2013-06-17 04:11:10 +00:00
v0 = s [ 4 ] . acbd ( ) ;
v1 = s [ 5 ] . acbd ( ) ;
v2 = s [ 6 ] . acbd ( ) ;
v3 = s [ 7 ] . acbd ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = ( v0 > > 24 ) . ps32 ( v1 > > 24 ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) & mask ;
2010-04-25 00:31:27 +00:00
2013-06-17 04:11:10 +00:00
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
GSVector4i : : storel ( & d1 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d1 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 3 ] , v5 ) ;
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
2013-06-17 04:11:10 +00:00
GSVector4i mask ( 0x0f0f0f0f ) ;
2009-02-09 21:15:56 +00:00
for ( int i = 0 ; i < 4 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
2013-06-17 04:11:10 +00:00
v0 = ( ( v0 > > 24 ) . ps32 ( v1 > > 24 ) ) . pu16 ( ( v2 > > 24 ) . ps32 ( v3 > > 24 ) ) & mask ;
2009-02-09 21:15:56 +00:00
GSVector4i : : storel ( dst , v0 ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
GSVector4i : : storeh ( dst , v0 ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
}
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
__forceinline static void ReadBlock4HHP ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
uint8 * RESTRICT d0 = & dst [ dstpitch * 0 ] ;
uint8 * RESTRICT d1 = & dst [ dstpitch * 4 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i v0 , v1 , v2 , v3 ;
GSVector4i v4 , v5 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = s [ 0 ] . acbd ( ) ;
v1 = s [ 1 ] . acbd ( ) ;
v2 = s [ 2 ] . acbd ( ) ;
v3 = s [ 3 ] . acbd ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = ( v0 > > 28 ) . ps32 ( v1 > > 28 ) . pu16 ( ( v2 > > 28 ) . ps32 ( v3 > > 28 ) ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : storel ( & d0 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d0 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d0 [ dstpitch * 3 ] , v5 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = s [ 4 ] . acbd ( ) ;
v1 = s [ 5 ] . acbd ( ) ;
v2 = s [ 6 ] . acbd ( ) ;
v3 = s [ 7 ] . acbd ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = ( v0 > > 28 ) . ps32 ( v1 > > 28 ) . pu16 ( ( v2 > > 28 ) . ps32 ( v3 > > 28 ) ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v0 . extract < 0 > ( ) ;
v5 = v0 . extract < 1 > ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : storel ( & d1 [ dstpitch * 0 ] , v4 ) ;
GSVector4i : : storel ( & d1 [ dstpitch * 1 ] , v5 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 2 ] , v4 ) ;
GSVector4i : : storeh ( & d1 [ dstpitch * 3 ] , v5 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# else
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
for ( int i = 0 ; i < 4 ; i + + )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
v3 = s [ i * 4 + 3 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = ( ( v0 > > 28 ) . ps32 ( v1 > > 28 ) ) . pu16 ( ( v2 > > 28 ) . ps32 ( v3 > > 28 ) ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : storel ( dst , v0 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
dst + = dstpitch ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i : : storeh ( dst , v0 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
dst + = dstpitch ;
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
template < bool AEM , class V > __forceinline static V Expand24to32 ( const V & c , const V & TA0 )
{
return c | ( AEM ? TA0 . andnot ( c = = V : : zero ( ) ) : TA0 ) ; // TA0 & (c != GSVector4i::zero())
}
template < bool AEM , class V > __forceinline static V Expand16to32 ( const V & c , const V & TA0 , const V & TA1 )
{
return ( ( c & m_rxxx ) < < 3 ) | ( ( c & m_xgxx ) < < 6 ) | ( ( c & m_xxbx ) < < 9 ) | ( AEM ? TA0 . blend8 ( TA1 , c . sra16 ( 15 ) ) . andnot ( c = = V : : zero ( ) ) : TA0 . blend ( TA1 , c . sra16 ( 15 ) ) ) ;
}
2009-05-14 16:41:52 +00:00
template < bool AEM > static void ExpandBlock24 ( const uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const GIFRegTEXA & TEXA )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector8i mask = GSVector8i : : x00ffffff ( ) ;
for ( int i = 0 ; i < 4 ; i + + , dst + = dstpitch * 2 )
{
GSVector8i v0 = s [ i * 2 + 0 ] & mask ;
GSVector8i v1 = s [ i * 2 + 1 ] & mask ;
GSVector8i * d0 = ( GSVector8i * ) & dst [ dstpitch * 0 ] ;
GSVector8i * d1 = ( GSVector8i * ) & dst [ dstpitch * 1 ] ;
d0 [ 0 ] = Expand24to32 < AEM > ( v0 , TA0 ) ;
d1 [ 0 ] = Expand24to32 < AEM > ( v1 , TA0 ) ;
}
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector4i mask = GSVector4i : : x00ffffff ( ) ;
for ( int i = 0 ; i < 4 ; i + + , dst + = dstpitch * 2 )
{
GSVector4i v0 = s [ i * 4 + 0 ] & mask ;
GSVector4i v1 = s [ i * 4 + 1 ] & mask ;
GSVector4i v2 = s [ i * 4 + 2 ] & mask ;
GSVector4i v3 = s [ i * 4 + 3 ] & mask ;
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
2013-06-10 15:56:11 +00:00
d0 [ 0 ] = Expand24to32 < AEM > ( v0 , TA0 ) ;
d0 [ 1 ] = Expand24to32 < AEM > ( v1 , TA0 ) ;
d1 [ 0 ] = Expand24to32 < AEM > ( v2 , TA0 ) ;
d1 [ 1 ] = Expand24to32 < AEM > ( v3 , TA0 ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2012-02-08 16:57:14 +00:00
template < bool AEM > static void ExpandBlock16 ( const uint16 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const GIFRegTEXA & TEXA ) // do not inline, uses too many xmm regs
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector8i TA1 ( TEXA . TA1 < < 24 ) ;
for ( int i = 0 ; i < 8 ; i + + , dst + = dstpitch )
{
GSVector8i v = s [ i ] . acbd ( ) ;
( ( GSVector8i * ) dst ) [ 0 ] = Expand16to32 < AEM > ( v . upl16 ( v ) , TA0 , TA1 ) ;
( ( GSVector8i * ) dst ) [ 1 ] = Expand16to32 < AEM > ( v . uph16 ( v ) , TA0 , TA1 ) ;
}
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector4i TA1 ( TEXA . TA1 < < 24 ) ;
2012-02-08 16:57:14 +00:00
for ( int i = 0 ; i < 8 ; i + + , dst + = dstpitch )
2009-02-09 21:15:56 +00:00
{
2012-02-08 16:57:14 +00:00
GSVector4i v0 = s [ i * 2 + 0 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
( ( GSVector4i * ) dst ) [ 0 ] = Expand16to32 < AEM > ( v0 . upl16 ( v0 ) , TA0 , TA1 ) ;
( ( GSVector4i * ) dst ) [ 1 ] = Expand16to32 < AEM > ( v0 . uph16 ( v0 ) , TA0 , TA1 ) ;
2009-02-09 21:15:56 +00:00
2012-02-08 16:57:14 +00:00
GSVector4i v1 = s [ i * 2 + 1 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
( ( GSVector4i * ) dst ) [ 2 ] = Expand16to32 < AEM > ( v1 . upl16 ( v1 ) , TA0 , TA1 ) ;
( ( GSVector4i * ) dst ) [ 3 ] = Expand16to32 < AEM > ( v1 . uph16 ( v1 ) , TA0 , TA1 ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock8_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 16 ; j + + , dst + = dstpitch )
{
( ( const GSVector4i * ) src ) [ j ] . gather32_8 ( pal , ( GSVector4i * ) dst ) ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock8_16 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 16 ; j + + , dst + = dstpitch )
{
( ( const GSVector4i * ) src ) [ j ] . gather16_8 ( pal , ( GSVector4i * ) dst ) ;
}
}
2009-05-15 11:11:26 +00:00
__forceinline static void ExpandBlock4_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint64 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 16 ; j + + , dst + = dstpitch )
{
( ( const GSVector4i * ) src ) [ j ] . gather64_8 ( pal , ( GSVector4i * ) dst ) ;
}
}
2009-05-15 11:11:26 +00:00
__forceinline static void ExpandBlock4_16 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint64 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 16 ; j + + , dst + = dstpitch )
{
( ( const GSVector4i * ) src ) [ j ] . gather32_8 ( pal , ( GSVector4i * ) dst ) ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock8H_32 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
const GSVector4i * s = ( const GSVector4i * ) src ;
( ( GSVector4i * ) dst ) [ 0 ] = ( s [ j * 2 + 0 ] > > 24 ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 1 ] = ( s [ j * 2 + 1 ] > > 24 ) . gather32_32 < > ( pal ) ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock8H_16 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 = ( s [ j * 2 + 0 ] > > 24 ) . gather32_32 < > ( pal ) ;
GSVector4i v1 = ( s [ j * 2 + 1 ] > > 24 ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 0 ] = v0 . pu32 ( v1 ) ;
# else
for ( int i = 0 ; i < 8 ; i + + )
{
2009-05-14 16:41:52 +00:00
( ( uint16 * ) dst ) [ i ] = ( uint16 ) pal [ src [ j * 8 + i ] > > 24 ] ;
2009-02-09 21:15:56 +00:00
}
# endif
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock4HL_32 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
const GSVector4i * s = ( const GSVector4i * ) src ;
( ( GSVector4i * ) dst ) [ 0 ] = ( ( s [ j * 2 + 0 ] > > 24 ) & 0xf ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 1 ] = ( ( s [ j * 2 + 1 ] > > 24 ) & 0xf ) . gather32_32 < > ( pal ) ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock4HL_16 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 = ( ( s [ j * 2 + 0 ] > > 24 ) & 0xf ) . gather32_32 < > ( pal ) ;
GSVector4i v1 = ( ( s [ j * 2 + 1 ] > > 24 ) & 0xf ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 0 ] = v0 . pu32 ( v1 ) ;
# else
for ( int i = 0 ; i < 8 ; i + + )
{
2009-05-14 16:41:52 +00:00
( ( uint16 * ) dst ) [ i ] = ( uint16 ) pal [ ( src [ j * 8 + i ] > > 24 ) & 0xf ] ;
2009-02-09 21:15:56 +00:00
}
# endif
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock4HH_32 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
const GSVector4i * s = ( const GSVector4i * ) src ;
( ( GSVector4i * ) dst ) [ 0 ] = ( s [ j * 2 + 0 ] > > 28 ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 1 ] = ( s [ j * 2 + 1 ] > > 28 ) . gather32_32 < > ( pal ) ;
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void ExpandBlock4HH_16 ( uint32 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
for ( int j = 0 ; j < 8 ; j + + , dst + = dstpitch )
{
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 = ( s [ j * 2 + 0 ] > > 28 ) . gather32_32 < > ( pal ) ;
GSVector4i v1 = ( s [ j * 2 + 1 ] > > 28 ) . gather32_32 < > ( pal ) ;
( ( GSVector4i * ) dst ) [ 0 ] = v0 . pu32 ( v1 ) ;
# else
for ( int i = 0 ; i < 8 ; i + + )
{
2009-05-14 16:41:52 +00:00
( ( uint16 * ) dst ) [ i ] = ( uint16 ) pal [ src [ j * 8 + i ] > > 28 ] ;
2009-02-09 21:15:56 +00:00
}
# endif
}
}
2009-05-14 16:41:52 +00:00
__forceinline static void UnpackAndWriteBlock24 ( const uint8 * RESTRICT src , int srcpitch , uint8 * RESTRICT dst )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
const uint8 * RESTRICT s0 = & src [ srcpitch * 0 ] ;
const uint8 * RESTRICT s1 = & src [ srcpitch * 1 ] ;
const uint8 * RESTRICT s2 = & src [ srcpitch * 2 ] ;
const uint8 * RESTRICT s3 = & src [ srcpitch * 3 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector8i v0 , v1 , v2 , v3 , v4 , v5 , v6 ;
GSVector8i mask = GSVector8i : : x00ffffff ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = GSVector8i : : load ( s0 , s0 + 8 , s2 , s2 + 8 ) ;
v5 = GSVector8i : : load ( s0 + 16 , s1 , s2 + 16 , s3 ) ;
v6 = GSVector8i : : load ( s1 + 8 , s1 + 16 , s3 + 8 , s3 + 16 ) ;
v0 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v4 . srl < 12 > ( v5 ) ;
v1 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v5 . srl < 8 > ( v6 ) ;
v2 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v6 . srl < 4 > ( ) ;
v3 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector8i : : sw64 ( v0 , v2 , v1 , v3 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 0 ] = ( ( GSVector8i * ) dst ) [ 0 ] . blend8 ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 1 ] = ( ( GSVector8i * ) dst ) [ 1 ] . blend8 ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 2 ] = ( ( GSVector8i * ) dst ) [ 2 ] . blend8 ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 3 ] = ( ( GSVector8i * ) dst ) [ 3 ] . blend8 ( v3 , mask ) ;
src + = srcpitch * 4 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
s0 = & src [ srcpitch * 0 ] ;
s1 = & src [ srcpitch * 1 ] ;
s2 = & src [ srcpitch * 2 ] ;
s3 = & src [ srcpitch * 3 ] ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = GSVector8i : : load ( s0 , s0 + 8 , s2 , s2 + 8 ) ;
v5 = GSVector8i : : load ( s0 + 16 , s1 , s2 + 16 , s3 ) ;
v6 = GSVector8i : : load ( s1 + 8 , s1 + 16 , s3 + 8 , s3 + 16 ) ;
v0 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v4 . srl < 12 > ( v5 ) ;
v1 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v5 . srl < 8 > ( v6 ) ;
v2 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
v4 = v6 . srl < 4 > ( ) ;
v3 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) . acbd ( ) ;
GSVector8i : : sw64 ( v0 , v2 , v1 , v3 ) ;
( ( GSVector8i * ) dst ) [ 4 ] = ( ( GSVector8i * ) dst ) [ 4 ] . blend8 ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 5 ] = ( ( GSVector8i * ) dst ) [ 5 ] . blend8 ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 6 ] = ( ( GSVector8i * ) dst ) [ 6 ] . blend8 ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 7 ] = ( ( GSVector8i * ) dst ) [ 7 ] . blend8 ( v3 , mask ) ;
# else
GSVector4i v0 , v1 , v2 , v3 , v4 , v5 , v6 ;
GSVector4i mask = GSVector4i : : x00ffffff ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
for ( int i = 0 ; i < 4 ; i + + , src + = srcpitch * 2 )
{
v4 = GSVector4i : : load < false > ( src ) ;
v5 = GSVector4i : : load ( src + 16 , src + srcpitch ) ;
v6 = GSVector4i : : load < false > ( src + srcpitch + 8 ) ;
v0 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) ;
v4 = v4 . srl < 12 > ( v5 ) ;
v1 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) ;
v4 = v5 . srl < 8 > ( v6 ) ;
v2 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) ;
v4 = v6 . srl < 4 > ( ) ;
v3 = v4 . upl32 ( v4 . srl < 3 > ( ) ) . upl64 ( v4 . srl < 6 > ( ) . upl32 ( v4 . srl < 9 > ( ) ) ) ;
2009-02-09 21:15:56 +00:00
GSVector4i : : sw64 ( v0 , v2 , v1 , v3 ) ;
2010-04-25 00:31:27 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 0 ] . blend8 ( v0 , mask ) ;
2009-02-09 21:15:56 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 1 ] . blend8 ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 2 ] . blend8 ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 3 ] . blend8 ( v3 , mask ) ;
}
2013-06-17 04:11:10 +00:00
# endif
2009-02-09 21:15:56 +00:00
}
2009-05-14 16:41:52 +00:00
__forceinline static void UnpackAndWriteBlock8H ( const uint8 * RESTRICT src , int srcpitch , uint8 * RESTRICT dst )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
GSVector4i v4 , v5 , v6 , v7 ;
# if _M_SSE >= 0x501
GSVector8i v0 , v1 , v2 , v3 ;
GSVector8i mask = GSVector8i : : xff000000 ( ) ;
v4 = GSVector4i : : loadl ( & src [ srcpitch * 0 ] ) ;
v5 = GSVector4i : : loadl ( & src [ srcpitch * 1 ] ) ;
v6 = GSVector4i : : loadl ( & src [ srcpitch * 2 ] ) ;
v7 = GSVector4i : : loadl ( & src [ srcpitch * 3 ] ) ;
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v6 . upl16 ( v7 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
( ( GSVector8i * ) dst ) [ 0 ] = ( ( GSVector8i * ) dst ) [ 0 ] . blend8 ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 1 ] = ( ( GSVector8i * ) dst ) [ 1 ] . blend8 ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 2 ] = ( ( GSVector8i * ) dst ) [ 2 ] . blend8 ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 3 ] = ( ( GSVector8i * ) dst ) [ 3 ] . blend8 ( v3 , mask ) ;
src + = srcpitch * 4 ;
v4 = GSVector4i : : loadl ( & src [ srcpitch * 0 ] ) ;
v5 = GSVector4i : : loadl ( & src [ srcpitch * 1 ] ) ;
v6 = GSVector4i : : loadl ( & src [ srcpitch * 2 ] ) ;
v7 = GSVector4i : : loadl ( & src [ srcpitch * 3 ] ) ;
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v6 . upl16 ( v7 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 4 ] = ( ( GSVector8i * ) dst ) [ 4 ] . blend8 ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 5 ] = ( ( GSVector8i * ) dst ) [ 5 ] . blend8 ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 6 ] = ( ( GSVector8i * ) dst ) [ 6 ] . blend8 ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 7 ] = ( ( GSVector8i * ) dst ) [ 7 ] . blend8 ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# elif _M_SSE >= 0x301
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i : : xff000000 ( ) ;
2009-02-09 21:15:56 +00:00
GSVector4i mask0 = m_uw8hmask0 ;
GSVector4i mask1 = m_uw8hmask1 ;
GSVector4i mask2 = m_uw8hmask2 ;
GSVector4i mask3 = m_uw8hmask3 ;
for ( int i = 0 ; i < 4 ; i + + , src + = srcpitch * 2 )
{
2013-06-17 04:11:10 +00:00
v4 = GSVector4i : : load ( src , src + srcpitch ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = v4 . shuffle8 ( mask0 ) ;
v1 = v4 . shuffle8 ( mask1 ) ;
v2 = v4 . shuffle8 ( mask2 ) ;
v3 = v4 . shuffle8 ( mask3 ) ;
2009-02-09 21:15:56 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 0 ] . blend8 ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 1 ] . blend8 ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 2 ] . blend8 ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 3 ] . blend8 ( v3 , mask ) ;
}
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i : : xff000000 ( ) ;
2009-02-09 21:15:56 +00:00
for ( int i = 0 ; i < 4 ; i + + , src + = srcpitch * 2 )
{
2013-06-17 04:11:10 +00:00
v4 = GSVector4i : : loadl ( & src [ srcpitch * 0 ] ) ;
v5 = GSVector4i : : loadl ( & src [ srcpitch * 1 ] ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v6 = v4 . upl16 ( v5 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v6 . upl8 ( v6 ) ;
v5 = v6 . uph8 ( v6 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v0 = v4 . upl16 ( v4 ) ;
v1 = v4 . uph16 ( v4 ) ;
v2 = v5 . upl16 ( v5 ) ;
v3 = v5 . uph16 ( v5 ) ;
2009-02-09 21:15:56 +00:00
( ( GSVector4i * ) dst ) [ i * 4 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 0 ] . blend8 ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 1 ] . blend8 ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 2 ] . blend8 ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 4 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 4 + 3 ] . blend8 ( v3 , mask ) ;
}
# endif
}
2009-05-14 16:41:52 +00:00
__forceinline static void UnpackAndWriteBlock4HL ( const uint8 * RESTRICT src , int srcpitch , uint8 * RESTRICT dst )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("4HL\n");
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
if ( 0 )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
uint8 * s = ( uint8 * ) src ;
for ( int j = 0 ; j < 8 ; j + + , s + = srcpitch )
for ( int i = 0 ; i < 4 ; i + + ) s [ i ] = ( columnTable32 [ j ] [ i * 2 ] & 0x0f ) | ( columnTable32 [ j ] [ i * 2 + 1 ] < < 4 ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-17 04:11:10 +00:00
GSVector4i v4 , v5 , v6 , v7 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector8i v0 , v1 , v2 , v3 ;
GSVector8i mask ( 0x0f000000 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v6 = GSVector4i ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v6 . upl8 ( v6 > > 4 ) ;
v5 = v6 . uph8 ( v6 > > 4 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v4 . uph16 ( v5 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 0 ] = ( ( GSVector8i * ) dst ) [ 0 ] . blend ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 1 ] = ( ( GSVector8i * ) dst ) [ 1 ] . blend ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 2 ] = ( ( GSVector8i * ) dst ) [ 2 ] . blend ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 3 ] = ( ( GSVector8i * ) dst ) [ 3 ] . blend ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
src + = srcpitch * 4 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v6 = GSVector4i ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = v6 . upl8 ( v6 > > 4 ) ;
v5 = v6 . uph8 ( v6 > > 4 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v4 . uph16 ( v5 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 4 ] = ( ( GSVector8i * ) dst ) [ 4 ] . blend ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 5 ] = ( ( GSVector8i * ) dst ) [ 5 ] . blend ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 6 ] = ( ( GSVector8i * ) dst ) [ 6 ] . blend ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 7 ] = ( ( GSVector8i * ) dst ) [ 7 ] . blend ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# elif _M_SSE >= 0x301
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i ( 0x0f000000 ) ;
2009-02-09 21:15:56 +00:00
GSVector4i mask0 = m_uw8hmask0 ;
GSVector4i mask1 = m_uw8hmask1 ;
GSVector4i mask2 = m_uw8hmask2 ;
GSVector4i mask3 = m_uw8hmask3 ;
for ( int i = 0 ; i < 2 ; i + + , src + = srcpitch * 4 )
{
2013-06-17 04:11:10 +00:00
GSVector4i v ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
v4 = v . upl8 ( v > > 4 ) ;
v5 = v . uph8 ( v > > 4 ) ;
v0 = v4 . shuffle8 ( mask0 ) ;
v1 = v4 . shuffle8 ( mask1 ) ;
v2 = v4 . shuffle8 ( mask2 ) ;
v3 = v4 . shuffle8 ( mask3 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 0 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 1 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 2 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 3 ] . blend ( v3 , mask ) ;
v0 = v5 . shuffle8 ( mask0 ) ;
v1 = v5 . shuffle8 ( mask1 ) ;
v2 = v5 . shuffle8 ( mask2 ) ;
v3 = v5 . shuffle8 ( mask3 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 4 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 4 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 5 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 5 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 6 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 6 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 7 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 7 ] . blend ( v3 , mask ) ;
}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# else
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i ( 0x0f000000 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
for ( int i = 0 ; i < 2 ; i + + , src + = srcpitch * 4 )
{
GSVector4i v ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
v4 = v . upl8 ( v > > 4 ) ;
v5 = v . uph8 ( v > > 4 ) ;
v6 = v4 . upl16 ( v5 ) ;
v7 = v4 . uph16 ( v5 ) ;
v4 = v6 . upl8 ( v6 ) ;
v5 = v6 . uph8 ( v6 ) ;
v6 = v7 . upl8 ( v7 ) ;
v7 = v7 . uph8 ( v7 ) ;
v0 = v4 . upl16 ( v4 ) ;
v1 = v4 . uph16 ( v4 ) ;
v2 = v5 . upl16 ( v5 ) ;
v3 = v5 . uph16 ( v5 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 0 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 1 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 2 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 3 ] . blend ( v3 , mask ) ;
v0 = v6 . upl16 ( v6 ) ;
v1 = v6 . uph16 ( v6 ) ;
v2 = v7 . upl16 ( v7 ) ;
v3 = v7 . uph16 ( v7 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 4 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 4 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 5 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 5 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 6 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 6 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 7 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 7 ] . blend ( v3 , mask ) ;
}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# endif
}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
__forceinline static void UnpackAndWriteBlock4HH ( const uint8 * RESTRICT src , int srcpitch , uint8 * RESTRICT dst )
{
GSVector4i v4 , v5 , v6 , v7 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# if _M_SSE >= 0x501
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector8i v0 , v1 , v2 , v3 ;
GSVector8i mask = GSVector8i : : xf0000000 ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v6 = GSVector4i ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = ( v6 < < 4 ) . upl8 ( v6 ) ;
v5 = ( v6 < < 4 ) . uph8 ( v6 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v4 . uph16 ( v5 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 0 ] = ( ( GSVector8i * ) dst ) [ 0 ] . blend ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 1 ] = ( ( GSVector8i * ) dst ) [ 1 ] . blend ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 2 ] = ( ( GSVector8i * ) dst ) [ 2 ] . blend ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 3 ] = ( ( GSVector8i * ) dst ) [ 3 ] . blend ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
src + = srcpitch * 4 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v6 = GSVector4i ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v4 = ( v6 < < 4 ) . upl8 ( v6 ) ;
v5 = ( v6 < < 4 ) . uph8 ( v6 ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
v2 = GSVector8i : : cast ( v4 . upl16 ( v5 ) ) ;
v3 = GSVector8i : : cast ( v4 . uph16 ( v5 ) ) ;
v0 = v2 . u8to32c ( ) < < 24 ;
v1 = v2 . bbbb ( ) . u8to32c ( ) < < 24 ;
v2 = v3 . u8to32c ( ) < < 24 ;
v3 = v3 . bbbb ( ) . u8to32c ( ) < < 24 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
( ( GSVector8i * ) dst ) [ 4 ] = ( ( GSVector8i * ) dst ) [ 4 ] . blend ( v0 , mask ) ;
( ( GSVector8i * ) dst ) [ 5 ] = ( ( GSVector8i * ) dst ) [ 5 ] . blend ( v1 , mask ) ;
( ( GSVector8i * ) dst ) [ 6 ] = ( ( GSVector8i * ) dst ) [ 6 ] . blend ( v2 , mask ) ;
( ( GSVector8i * ) dst ) [ 7 ] = ( ( GSVector8i * ) dst ) [ 7 ] . blend ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# elif _M_SSE >= 0x301
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i : : xf0000000 ( ) ;
GSVector4i mask0 = m_uw8hmask0 ;
GSVector4i mask1 = m_uw8hmask1 ;
GSVector4i mask2 = m_uw8hmask2 ;
GSVector4i mask3 = m_uw8hmask3 ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
for ( int i = 0 ; i < 2 ; i + + , src + = srcpitch * 4 )
{
GSVector4i v ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
v4 = ( v < < 4 ) . upl8 ( v ) ;
v5 = ( v < < 4 ) . uph8 ( v ) ;
v0 = v4 . shuffle8 ( mask0 ) ;
v1 = v4 . shuffle8 ( mask1 ) ;
v2 = v4 . shuffle8 ( mask2 ) ;
v3 = v4 . shuffle8 ( mask3 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 0 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 1 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 2 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 3 ] . blend ( v3 , mask ) ;
v0 = v5 . shuffle8 ( mask0 ) ;
v1 = v5 . shuffle8 ( mask1 ) ;
v2 = v5 . shuffle8 ( mask2 ) ;
v3 = v5 . shuffle8 ( mask3 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 4 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 4 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 5 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 5 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 6 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 6 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 7 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 7 ] . blend ( v3 , mask ) ;
}
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
# else
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = GSVector4i : : xf0000000 ( ) ;
2009-02-09 21:15:56 +00:00
2013-06-17 04:11:10 +00:00
for ( int i = 0 ; i < 2 ; i + + , src + = srcpitch * 4 )
{
GSVector4i v ( * ( uint32 * ) & src [ srcpitch * 0 ] , * ( uint32 * ) & src [ srcpitch * 2 ] , * ( uint32 * ) & src [ srcpitch * 1 ] , * ( uint32 * ) & src [ srcpitch * 3 ] ) ;
v4 = ( v < < 4 ) . upl8 ( v ) ;
v5 = ( v < < 4 ) . uph8 ( v ) ;
v6 = v4 . upl16 ( v5 ) ;
v7 = v4 . uph16 ( v5 ) ;
v4 = v6 . upl8 ( v6 ) ;
v5 = v6 . uph8 ( v6 ) ;
v6 = v7 . upl8 ( v7 ) ;
v7 = v7 . uph8 ( v7 ) ;
v0 = v4 . upl16 ( v4 ) ;
v1 = v4 . uph16 ( v4 ) ;
v2 = v5 . upl16 ( v5 ) ;
v3 = v5 . uph16 ( v5 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 0 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 0 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 1 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 1 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 2 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 2 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 3 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 3 ] . blend ( v3 , mask ) ;
v0 = v6 . upl16 ( v6 ) ;
v1 = v6 . uph16 ( v6 ) ;
v2 = v7 . upl16 ( v7 ) ;
v3 = v7 . uph16 ( v7 ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 4 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 4 ] . blend ( v0 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 5 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 5 ] . blend ( v1 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 6 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 6 ] . blend ( v2 , mask ) ;
( ( GSVector4i * ) dst ) [ i * 8 + 7 ] = ( ( GSVector4i * ) dst ) [ i * 8 + 7 ] . blend ( v3 , mask ) ;
2009-02-09 21:15:56 +00:00
}
# endif
}
2009-05-14 16:41:52 +00:00
template < bool AEM > __forceinline static void ReadAndExpandBlock24 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const GIFRegTEXA & TEXA )
2009-02-09 21:15:56 +00:00
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector8i mask = GSVector8i : : x00ffffff ( ) ;
2013-06-20 05:07:52 +00:00
GSVector8i v0 , v1 , v2 , v3 ;
2013-06-10 15:56:11 +00:00
2013-06-20 05:07:52 +00:00
v0 = s [ 0 ] & mask ;
v1 = s [ 1 ] & mask ;
v2 = s [ 2 ] & mask ;
v3 = s [ 3 ] & mask ;
2013-06-10 15:56:11 +00:00
2013-06-20 05:07:52 +00:00
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw64 ( v0 , v1 ) ;
GSVector8i : : sw128 ( v2 , v3 ) ;
GSVector8i : : sw64 ( v2 , v3 ) ;
2013-06-10 15:56:11 +00:00
2013-06-20 05:07:52 +00:00
* ( GSVector8i * ) & dst [ dstpitch * 0 ] = Expand24to32 < AEM > ( v0 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 1 ] = Expand24to32 < AEM > ( v1 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 2 ] = Expand24to32 < AEM > ( v2 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 3 ] = Expand24to32 < AEM > ( v3 , TA0 ) ;
2013-06-10 15:56:11 +00:00
2013-06-20 05:07:52 +00:00
v0 = s [ 4 ] & mask ;
v1 = s [ 5 ] & mask ;
v2 = s [ 6 ] & mask ;
v3 = s [ 7 ] & mask ;
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw64 ( v0 , v1 ) ;
GSVector8i : : sw128 ( v2 , v3 ) ;
GSVector8i : : sw64 ( v2 , v3 ) ;
dst + = dstpitch * 4 ;
* ( GSVector8i * ) & dst [ dstpitch * 0 ] = Expand24to32 < AEM > ( v0 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 1 ] = Expand24to32 < AEM > ( v1 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 2 ] = Expand24to32 < AEM > ( v2 , TA0 ) ;
* ( GSVector8i * ) & dst [ dstpitch * 3 ] = Expand24to32 < AEM > ( v3 , TA0 ) ;
2013-06-10 15:56:11 +00:00
# else
2009-02-09 21:15:56 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector4i mask = GSVector4i : : x00ffffff ( ) ;
for ( int i = 0 ; i < 4 ; i + + , dst + = dstpitch * 2 )
{
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
GSVector4i v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
v0 & = mask ;
v1 & = mask ;
v2 & = mask ;
v3 & = mask ;
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
2013-06-10 15:56:11 +00:00
d0 [ 0 ] = Expand24to32 < AEM > ( v0 , TA0 ) ;
d0 [ 1 ] = Expand24to32 < AEM > ( v1 , TA0 ) ;
d1 [ 0 ] = Expand24to32 < AEM > ( v2 , TA0 ) ;
d1 [ 1 ] = Expand24to32 < AEM > ( v3 , TA0 ) ;
2009-02-09 21:15:56 +00:00
}
2013-06-10 15:56:11 +00:00
# endif
2012-02-08 16:57:14 +00:00
}
template < bool AEM > __forceinline static void ReadAndExpandBlock16 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const GIFRegTEXA & TEXA )
{
2013-06-10 15:56:11 +00:00
# if _M_SSE >= 0x501
const GSVector8i * s = ( const GSVector8i * ) src ;
GSVector8i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector8i TA1 ( TEXA . TA1 < < 24 ) ;
for ( int i = 0 ; i < 4 ; i + + , dst + = dstpitch * 2 )
{
GSVector8i v0 = s [ i * 2 + 0 ] . shuffle8 ( m_r16mask ) ;
GSVector8i v1 = s [ i * 2 + 1 ] . shuffle8 ( m_r16mask ) ;
GSVector8i : : sw128 ( v0 , v1 ) ;
GSVector8i : : sw32 ( v0 , v1 ) ;
GSVector8i * d0 = ( GSVector8i * ) & dst [ dstpitch * 0 ] ;
GSVector8i * d1 = ( GSVector8i * ) & dst [ dstpitch * 1 ] ;
d0 [ 0 ] = Expand16to32 < AEM > ( v0 . upl16 ( v0 ) , TA0 , TA1 ) ;
d0 [ 1 ] = Expand16to32 < AEM > ( v0 . uph16 ( v0 ) , TA0 , TA1 ) ;
d1 [ 0 ] = Expand16to32 < AEM > ( v1 . upl16 ( v1 ) , TA0 , TA1 ) ;
d1 [ 1 ] = Expand16to32 < AEM > ( v1 . uph16 ( v1 ) , TA0 , TA1 ) ;
}
2013-07-01 21:28:58 +00:00
2013-06-10 15:56:11 +00:00
# elif 0 // not faster
2012-02-08 16:57:14 +00:00
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i TA0 ( TEXA . TA0 < < 24 ) ;
GSVector4i TA1 ( TEXA . TA1 < < 24 ) ;
for ( int i = 0 ; i < 4 ; i + + , dst + = dstpitch * 2 )
{
GSVector4i v0 = s [ i * 4 + 0 ] ;
GSVector4i v1 = s [ i * 4 + 1 ] ;
GSVector4i v2 = s [ i * 4 + 2 ] ;
GSVector4i v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw16 ( v0 , v2 , v1 , v3 ) ;
GSVector4i * d0 = ( GSVector4i * ) & dst [ dstpitch * 0 ] ;
d0 [ 0 ] = Expand16to32 < AEM > ( v0 . upl16 ( v0 ) , TA0 , TA1 ) ;
d0 [ 1 ] = Expand16to32 < AEM > ( v0 . uph16 ( v0 ) , TA0 , TA1 ) ;
d0 [ 2 ] = Expand16to32 < AEM > ( v1 . upl16 ( v1 ) , TA0 , TA1 ) ;
d0 [ 3 ] = Expand16to32 < AEM > ( v1 . uph16 ( v1 ) , TA0 , TA1 ) ;
GSVector4i * d1 = ( GSVector4i * ) & dst [ dstpitch * 1 ] ;
d1 [ 0 ] = Expand16to32 < AEM > ( v2 . upl16 ( v2 ) , TA0 , TA1 ) ;
d1 [ 1 ] = Expand16to32 < AEM > ( v2 . uph16 ( v2 ) , TA0 , TA1 ) ;
d1 [ 2 ] = Expand16to32 < AEM > ( v3 . upl16 ( v3 ) , TA0 , TA1 ) ;
d1 [ 3 ] = Expand16to32 < AEM > ( v3 . uph16 ( v3 ) , TA0 , TA1 ) ;
}
# else
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint16 block [ 16 * 8 ] ;
2012-02-08 16:57:14 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock16 ( src , ( uint8 * ) block , sizeof ( block ) / 8 ) ;
2012-02-08 16:57:14 +00:00
ExpandBlock16 < AEM > ( block , dst , dstpitch , TEXA ) ;
# endif
}
2009-02-09 21:15:56 +00:00
2009-05-14 16:41:52 +00:00
__forceinline static void ReadAndExpandBlock8_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadAndExpandBlock8_32\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = m_r8mask ;
for ( int i = 0 ; i < 2 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 8 + 0 ] . shuffle8 ( mask ) ;
v1 = s [ i * 8 + 1 ] . shuffle8 ( mask ) ;
v2 = s [ i * 8 + 2 ] . shuffle8 ( mask ) ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 8 + 3 ] . shuffle8 ( mask ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v3 , v2 ) ;
v0 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v3 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v1 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v2 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
2010-04-25 00:31:27 +00:00
v2 = s [ i * 8 + 4 ] . shuffle8 ( mask ) ;
v3 = s [ i * 8 + 5 ] . shuffle8 ( mask ) ;
v0 = s [ i * 8 + 6 ] . shuffle8 ( mask ) ;
2009-02-09 21:15:56 +00:00
v1 = s [ i * 8 + 7 ] . shuffle8 ( mask ) ;
GSVector4i : : sw16 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw32 ( v0 , v1 , v3 , v2 ) ;
v0 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v3 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v1 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v2 . gather32_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
}
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint8 block [ 16 * 16 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock8 ( src , ( uint8 * ) block , sizeof ( block ) / 16 ) ;
2009-02-09 21:15:56 +00:00
ExpandBlock8_32 ( block , dst , dstpitch , pal ) ;
# endif
}
// TODO: ReadAndExpandBlock8_16
2009-05-15 11:11:26 +00:00
__forceinline static void ReadAndExpandBlock4_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint64 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadAndExpandBlock4_32\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
GSVector4i mask = m_r4mask ;
for ( int i = 0 ; i < 2 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 8 + 0 ] . xzyw ( ) ;
v1 = s [ i * 8 + 1 ] . xzyw ( ) ;
v2 = s [ i * 8 + 2 ] . xzyw ( ) ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 8 + 3 ] . xzyw ( ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw4 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
v0 = v0 . shuffle8 ( mask ) ;
v1 = v1 . shuffle8 ( mask ) ;
v2 = v2 . shuffle8 ( mask ) ;
v3 = v3 . shuffle8 ( mask ) ;
GSVector4i : : sw16rh ( v0 , v1 , v2 , v3 ) ;
v0 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v1 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v2 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v3 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
2010-04-25 00:31:27 +00:00
v0 = s [ i * 8 + 4 ] . xzyw ( ) ;
v1 = s [ i * 8 + 5 ] . xzyw ( ) ;
v2 = s [ i * 8 + 6 ] . xzyw ( ) ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 8 + 7 ] . xzyw ( ) ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
GSVector4i : : sw4 ( v0 , v2 , v1 , v3 ) ;
GSVector4i : : sw8 ( v0 , v1 , v2 , v3 ) ;
v0 = v0 . shuffle8 ( mask ) ;
v1 = v1 . shuffle8 ( mask ) ;
v2 = v2 . shuffle8 ( mask ) ;
v3 = v3 . shuffle8 ( mask ) ;
GSVector4i : : sw16rl ( v0 , v1 , v2 , v3 ) ;
v0 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v1 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v2 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
v3 . gather64_8 < > ( pal , ( GSVector4i * ) dst ) ;
dst + = dstpitch ;
}
# else
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint8 block [ ( 32 / 2 ) * 16 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock4 ( src , ( uint8 * ) block , sizeof ( block ) / 16 ) ;
2009-02-09 21:15:56 +00:00
2009-05-14 16:41:52 +00:00
ExpandBlock4_32 ( block , dst , dstpitch , pal ) ;
2009-02-09 21:15:56 +00:00
# endif
}
// TODO: ReadAndExpandBlock4_16
2009-05-14 16:41:52 +00:00
__forceinline static void ReadAndExpandBlock8H_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadAndExpandBlock8H_32\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
for ( int i = 0 ; i < 4 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
( v0 > > 24 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( v1 > > 24 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
( v2 > > 24 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( v3 > > 24 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
}
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint32 block [ 8 * 8 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock32 ( src , ( uint8 * ) block , sizeof ( block ) / 8 ) ;
2009-02-09 21:15:56 +00:00
ExpandBlock8H_32 ( block , dst , dstpitch , pal ) ;
# endif
}
// TODO: ReadAndExpandBlock8H_16
2009-05-14 16:41:52 +00:00
__forceinline static void ReadAndExpandBlock4HL_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadAndExpandBlock4HL_32\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
for ( int i = 0 ; i < 4 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
( ( v0 > > 24 ) & 0xf ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( ( v1 > > 24 ) & 0xf ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
( ( v2 > > 24 ) & 0xf ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( ( v3 > > 24 ) & 0xf ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
}
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint32 block [ 8 * 8 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock32 ( src , ( uint8 * ) block , sizeof ( block ) / 8 ) ;
2009-02-09 21:15:56 +00:00
ExpandBlock4HL_32 ( block , dst , dstpitch , pal ) ;
# endif
}
// TODO: ReadAndExpandBlock4HL_16
2010-04-25 00:31:27 +00:00
2009-05-14 16:41:52 +00:00
__forceinline static void ReadAndExpandBlock4HH_32 ( const uint8 * RESTRICT src , uint8 * RESTRICT dst , int dstpitch , const uint32 * RESTRICT pal )
2009-02-09 21:15:56 +00:00
{
2013-06-17 04:11:10 +00:00
//printf("ReadAndExpandBlock4HH_32\n");
2009-02-09 21:15:56 +00:00
# if _M_SSE >= 0x401
const GSVector4i * s = ( const GSVector4i * ) src ;
GSVector4i v0 , v1 , v2 , v3 ;
for ( int i = 0 ; i < 4 ; i + + )
{
2010-04-25 00:31:27 +00:00
v0 = s [ i * 4 + 0 ] ;
v1 = s [ i * 4 + 1 ] ;
v2 = s [ i * 4 + 2 ] ;
2009-02-09 21:15:56 +00:00
v3 = s [ i * 4 + 3 ] ;
GSVector4i : : sw64 ( v0 , v1 , v2 , v3 ) ;
( v0 > > 28 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( v1 > > 28 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
( v2 > > 28 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 0 ] ) ;
( v3 > > 28 ) . gather32_32 < > ( pal , ( GSVector4i * ) & dst [ 16 ] ) ;
2010-04-25 00:31:27 +00:00
2009-02-09 21:15:56 +00:00
dst + = dstpitch ;
}
2009-05-14 16:41:52 +00:00
# else
2009-02-09 21:15:56 +00:00
2016-05-11 23:24:37 +00:00
alignas ( 32 ) uint32 block [ 8 * 8 ] ;
2009-02-09 21:15:56 +00:00
2013-06-10 15:56:11 +00:00
ReadBlock32 ( src , ( uint8 * ) block , sizeof ( block ) / 8 ) ;
2009-02-09 21:15:56 +00:00
ExpandBlock4HH_32 ( block , dst , dstpitch , pal ) ;
# endif
}
// TODO: ReadAndExpandBlock4HH_16
} ;