mirror of https://github.com/PCSX2/pcsx2.git
IPU:
* Savestate mega-fix! Removed all the old direct pointer types from decoder_t, which should fix the oddball random savestate crashes when IPU is active. * Moved iq/niq into decoder_t. * Moved all macroblocks into decoder_t (mb8, mb16, rgb16, rgb32). * Turned decoder.stride into a constant, since IPU can only decode in strides of 16 bytes. * Added sanity checking to the ipu0_fifo stuff (was formerly g_nIPU0Data, etc.). * Added some SSE moves to the Idct (very minor optimization). There's a complete, from-the-ground-up SSE implementation provided by newer versions of libmpeg2 that we should probably look into later, rather than rolling our own. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3587 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
63a2d9c228
commit
61a4b537c4
12
pcsx2/Dmac.h
12
pcsx2/Dmac.h
|
@ -161,8 +161,8 @@ union tDMA_SADR {
|
||||||
|
|
||||||
union tDMA_MADR {
|
union tDMA_MADR {
|
||||||
struct {
|
struct {
|
||||||
u32 ADDR : 31; // Transfer memory address
|
u32 ADDR : 31; // Transfer memory address
|
||||||
u32 SPR : 1; // Memory/SPR Address
|
u32 SPR : 1; // Memory/SPR Address
|
||||||
};
|
};
|
||||||
u32 _u32;
|
u32 _u32;
|
||||||
|
|
||||||
|
@ -175,8 +175,8 @@ union tDMA_MADR {
|
||||||
|
|
||||||
union tDMA_TADR {
|
union tDMA_TADR {
|
||||||
struct {
|
struct {
|
||||||
u32 ADDR : 31; // Next Tag address
|
u32 ADDR : 31; // Next Tag address
|
||||||
u32 SPR : 1; // Memory/SPR Address
|
u32 SPR : 1; // Memory/SPR Address
|
||||||
};
|
};
|
||||||
u32 _u32;
|
u32 _u32;
|
||||||
|
|
||||||
|
@ -190,8 +190,8 @@ union tDMA_TADR {
|
||||||
// The Address Stack Register
|
// The Address Stack Register
|
||||||
union tDMA_ASR {
|
union tDMA_ASR {
|
||||||
struct {
|
struct {
|
||||||
u32 ADDR : 31; // Tag memory address
|
u32 ADDR : 31; // Tag memory address
|
||||||
u32 SPR : 1; // Memory/SPR Address
|
u32 SPR : 1; // Memory/SPR Address
|
||||||
};
|
};
|
||||||
u32 _u32;
|
u32 _u32;
|
||||||
|
|
||||||
|
|
|
@ -94,27 +94,7 @@ void __fastcall ReadFIFO_page_6(u32 mem, u64 *out)
|
||||||
out[1] = psHu64(GIF_FIFO + 8);
|
out[1] = psHu64(GIF_FIFO + 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
void __fastcall ReadFIFO_page_7(u32 mem, u64 *out)
|
// ReadFIFO_page_7 is contained in IPU_Fifo.cpp
|
||||||
{
|
|
||||||
pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) );
|
|
||||||
|
|
||||||
// All addresses in this page map to 0x7000 and 0x7010:
|
|
||||||
mem &= 0x10;
|
|
||||||
|
|
||||||
if( mem == 0 ) // IPUout_FIFO
|
|
||||||
{
|
|
||||||
if( g_nIPU0Data > 0 )
|
|
||||||
{
|
|
||||||
out[0] = *(u64*)(g_pIPU0Pointer);
|
|
||||||
out[1] = *(u64*)(g_pIPU0Pointer+8);
|
|
||||||
ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
|
|
||||||
g_nIPU0Data--;
|
|
||||||
g_pIPU0Pointer += 16;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else // IPUin_FIFO
|
|
||||||
ipu_fifo.out.readsingle((void*)out);
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
// WriteFIFO Pages
|
// WriteFIFO Pages
|
||||||
|
|
|
@ -43,12 +43,6 @@ tIPU_DMA g_nDMATransfer(0);
|
||||||
tIPU_cmd ipu_cmd;
|
tIPU_cmd ipu_cmd;
|
||||||
IPUStatus IPU1Status;
|
IPUStatus IPU1Status;
|
||||||
|
|
||||||
// FIXME - g_nIPU0Data and Pointer are not saved in the savestate, which breaks savestates for some
|
|
||||||
// FMVs at random (if they get saved during the half frame of a 30fps rate). The fix is complicated
|
|
||||||
// since coroutine is such a pita. (air)
|
|
||||||
int g_nIPU0Data = 0; // data left to transfer
|
|
||||||
u8* g_pIPU0Pointer = NULL;
|
|
||||||
|
|
||||||
void ReorderBitstream();
|
void ReorderBitstream();
|
||||||
|
|
||||||
// the BP doesn't advance and returns -1 if there is no data to be read
|
// the BP doesn't advance and returns -1 if there is no data to be read
|
||||||
|
@ -59,72 +53,36 @@ void IPUWorker();
|
||||||
// Color conversion stuff, the memory layout is a total hack
|
// Color conversion stuff, the memory layout is a total hack
|
||||||
// convert_data_buffer is a pointer to the internal rgb struct (the first param in convert_init_t)
|
// convert_data_buffer is a pointer to the internal rgb struct (the first param in convert_init_t)
|
||||||
//char convert_data_buffer[sizeof(convert_rgb_t)];
|
//char convert_data_buffer[sizeof(convert_rgb_t)];
|
||||||
char convert_data_buffer[0x1C];
|
//char convert_data_buffer[0x1C]; // unused?
|
||||||
|
//u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'}; // unused?
|
||||||
|
|
||||||
// Quantization matrix
|
// Quantization matrix
|
||||||
// Pointers outside of IPU.cpp point to niq & iq. As such, all hell breaks loose under gcc if you make them static.
|
static u16 vqclut[16]; //clut conversion table
|
||||||
u8 niq[64]; //non-intraquant matrix
|
static u8 s_thresh[2]; //thresholds for color conversions
|
||||||
u8 iq[64]; //intraquant matrix
|
|
||||||
u16 vqclut[16]; //clut conversion table
|
|
||||||
static u8 s_thresh[2]; //thresholds for color conversions
|
|
||||||
int coded_block_pattern = 0;
|
int coded_block_pattern = 0;
|
||||||
|
|
||||||
__aligned16 macroblock_8 mb8;
|
|
||||||
__aligned16 macroblock_16 mb16;
|
|
||||||
__aligned16 macroblock_rgb32 rgb32;
|
|
||||||
__aligned16 macroblock_rgb16 rgb16;
|
|
||||||
|
|
||||||
u8 indx4[16*16/2];
|
u8 indx4[16*16/2];
|
||||||
bool mpeg2_inited = false; //mpeg2_idct_init() must be called only once
|
__aligned16 decoder_t decoder;
|
||||||
u8 PCT[] = {'r', 'I', 'P', 'B', 'D', '-', '-', '-'};
|
|
||||||
__aligned16 decoder_t decoder; //static, only to place it in bss
|
|
||||||
|
|
||||||
extern "C"
|
|
||||||
{
|
|
||||||
extern u8 mpeg2_scan_norm[64];
|
|
||||||
extern u8 mpeg2_scan_alt[64];
|
|
||||||
}
|
|
||||||
|
|
||||||
__aligned16 u8 _readbits[80]; //local buffer (ring buffer)
|
__aligned16 u8 _readbits[80]; //local buffer (ring buffer)
|
||||||
u8* readbits = _readbits; // always can decrement by one 1qw
|
u8* readbits = _readbits; // always can decrement by one 1qw
|
||||||
|
|
||||||
__forceinline void IPUProcessInterrupt()
|
__forceinline void IPUProcessInterrupt()
|
||||||
{
|
{
|
||||||
if (ipuRegs->ctrl.BUSY && g_BP.IFC) IPUWorker();
|
if (ipuRegs->ctrl.BUSY && g_BP.IFC) IPUWorker();
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_g_decoder()
|
|
||||||
{
|
|
||||||
//other stuff
|
|
||||||
decoder.intra_quantizer_matrix = (u8*)iq;
|
|
||||||
decoder.non_intra_quantizer_matrix = (u8*)niq;
|
|
||||||
decoder.picture_structure = FRAME_PICTURE; //default: progressive...my guess:P
|
|
||||||
decoder.stride = 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
void mpeg2_init()
|
|
||||||
{
|
|
||||||
if (!mpeg2_inited)
|
|
||||||
{
|
|
||||||
mpeg2_idct_init();
|
|
||||||
yuv2rgb_init();
|
|
||||||
memzero(mb8.Y);
|
|
||||||
memzero(mb8.Cb);
|
|
||||||
memzero(mb8.Cr);
|
|
||||||
memzero(mb16.Y);
|
|
||||||
memzero(mb16.Cb);
|
|
||||||
memzero(mb16.Cr);
|
|
||||||
mpeg2_inited = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
// Register accesses (run on EE thread)
|
// Register accesses (run on EE thread)
|
||||||
int ipuInit()
|
int ipuInit()
|
||||||
{
|
{
|
||||||
memzero(*ipuRegs);
|
memzero(*ipuRegs);
|
||||||
memzero(g_BP);
|
memzero(g_BP);
|
||||||
init_g_decoder();
|
memzero(decoder);
|
||||||
|
|
||||||
|
decoder.picture_structure = FRAME_PICTURE; //default: progressive...my guess:P
|
||||||
|
|
||||||
g_nDMATransfer.reset();
|
g_nDMATransfer.reset();
|
||||||
IPU1Status.InProgress = false;
|
IPU1Status.InProgress = false;
|
||||||
IPU1Status.DMAMode = DMA_MODE_NORMAL;
|
IPU1Status.DMAMode = DMA_MODE_NORMAL;
|
||||||
|
@ -149,18 +107,16 @@ void ReportIPU()
|
||||||
Console.WriteLn(ipu_fifo.in.desc());
|
Console.WriteLn(ipu_fifo.in.desc());
|
||||||
Console.WriteLn(ipu_fifo.out.desc());
|
Console.WriteLn(ipu_fifo.out.desc());
|
||||||
Console.WriteLn(g_BP.desc());
|
Console.WriteLn(g_BP.desc());
|
||||||
Console.WriteLn("niq = 0x%x, iq = 0x%x.", niq, iq);
|
|
||||||
Console.WriteLn("vqclut = 0x%x.", vqclut);
|
Console.WriteLn("vqclut = 0x%x.", vqclut);
|
||||||
Console.WriteLn("s_thresh = 0x%x.", s_thresh);
|
Console.WriteLn("s_thresh = 0x%x.", s_thresh);
|
||||||
Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
|
Console.WriteLn("coded_block_pattern = 0x%x.", coded_block_pattern);
|
||||||
Console.WriteLn("g_decoder = 0x%x.", decoder);
|
Console.WriteLn("g_decoder = 0x%x.", &decoder);
|
||||||
Console.WriteLn("mpeg2: scan_norm = 0x%x, alt = 0x%x.", mpeg2_scan_norm, mpeg2_scan_alt);
|
Console.WriteLn("mpeg2_scan = 0x%x.", &mpeg2_scan);
|
||||||
Console.WriteLn(ipu_cmd.desc());
|
Console.WriteLn(ipu_cmd.desc());
|
||||||
Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
|
Console.WriteLn("_readbits = 0x%x. readbits - _readbits, which is also frozen, is 0x%x.",
|
||||||
_readbits, readbits - _readbits);
|
_readbits, readbits - _readbits);
|
||||||
Console.Newline();
|
Console.Newline();
|
||||||
}
|
}
|
||||||
// fixme - ipuFreeze looks fairly broken. Should probably take a closer look at some point.
|
|
||||||
|
|
||||||
void SaveStateBase::ipuFreeze()
|
void SaveStateBase::ipuFreeze()
|
||||||
{
|
{
|
||||||
|
@ -168,24 +124,15 @@ void SaveStateBase::ipuFreeze()
|
||||||
//ReportIPU();
|
//ReportIPU();
|
||||||
FreezeTag("IPU");
|
FreezeTag("IPU");
|
||||||
|
|
||||||
// old versions saved the IPU regs, but they're already saved as part of HW!
|
|
||||||
//FreezeMem(ipuRegs, sizeof(IPUregisters));
|
|
||||||
|
|
||||||
Freeze(g_nDMATransfer);
|
Freeze(g_nDMATransfer);
|
||||||
Freeze(ipu_fifo);
|
Freeze(ipu_fifo);
|
||||||
|
|
||||||
Freeze(g_BP);
|
Freeze(g_BP);
|
||||||
Freeze(niq);
|
|
||||||
Freeze(iq);
|
|
||||||
Freeze(vqclut);
|
Freeze(vqclut);
|
||||||
Freeze(s_thresh);
|
Freeze(s_thresh);
|
||||||
Freeze(coded_block_pattern);
|
Freeze(coded_block_pattern);
|
||||||
Freeze(decoder);
|
Freeze(decoder);
|
||||||
Freeze(mpeg2_scan_norm);
|
|
||||||
Freeze(mpeg2_scan_alt);
|
|
||||||
|
|
||||||
Freeze(ipu_cmd);
|
Freeze(ipu_cmd);
|
||||||
|
|
||||||
Freeze(_readbits);
|
Freeze(_readbits);
|
||||||
|
|
||||||
int temp = readbits - _readbits;
|
int temp = readbits - _readbits;
|
||||||
|
@ -194,16 +141,9 @@ void SaveStateBase::ipuFreeze()
|
||||||
if (IsLoading())
|
if (IsLoading())
|
||||||
{
|
{
|
||||||
readbits = _readbits;
|
readbits = _readbits;
|
||||||
init_g_decoder();
|
|
||||||
mpeg2_init();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ipuCanFreeze()
|
|
||||||
{
|
|
||||||
return (ipu_cmd.current == -1);
|
|
||||||
}
|
|
||||||
|
|
||||||
__forceinline u32 ipuRead32(u32 mem)
|
__forceinline u32 ipuRead32(u32 mem)
|
||||||
{
|
{
|
||||||
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
// Note: It's assumed that mem's input value is always in the 0x10002000 page
|
||||||
|
@ -223,7 +163,7 @@ __forceinline u32 ipuRead32(u32 mem)
|
||||||
if (!ipuRegs->ctrl.BUSY)
|
if (!ipuRegs->ctrl.BUSY)
|
||||||
IPU_LOG("Ipu read32: IPU_CTRL=0x%08X %x", ipuRegs->ctrl._u32, cpuRegs.pc);
|
IPU_LOG("Ipu read32: IPU_CTRL=0x%08X %x", ipuRegs->ctrl._u32, cpuRegs.pc);
|
||||||
|
|
||||||
return ipuRegs->ctrl._u32;
|
return ipuRegs->ctrl._u32;
|
||||||
|
|
||||||
ipucase(IPU_BP): // IPU_BP
|
ipucase(IPU_BP): // IPU_BP
|
||||||
ipuRegs->ipubp = g_BP.BP & 0x7f;
|
ipuRegs->ipubp = g_BP.BP & 0x7f;
|
||||||
|
@ -231,7 +171,8 @@ __forceinline u32 ipuRead32(u32 mem)
|
||||||
ipuRegs->ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
|
ipuRegs->ipubp |= (g_BP.FP /*+ g_BP.bufferhasnew*/) << 16;
|
||||||
|
|
||||||
IPU_LOG("Ipu read32: IPU_BP=0x%08X", ipuRegs->ipubp);
|
IPU_LOG("Ipu read32: IPU_BP=0x%08X", ipuRegs->ipubp);
|
||||||
return ipuRegs->ipubp;
|
return ipuRegs->ipubp;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
IPU_LOG("Ipu read32: Addr=0x%x Value = 0x%08X", mem, *(u32*)(((u8*)ipuRegs) + mem));
|
IPU_LOG("Ipu read32: Addr=0x%x Value = 0x%08X", mem, *(u32*)(((u8*)ipuRegs) + mem));
|
||||||
}
|
}
|
||||||
|
@ -277,7 +218,6 @@ __forceinline u64 ipuRead64(u32 mem)
|
||||||
|
|
||||||
void ipuSoftReset()
|
void ipuSoftReset()
|
||||||
{
|
{
|
||||||
mpeg2_init();
|
|
||||||
ipu_fifo.clear();
|
ipu_fifo.clear();
|
||||||
|
|
||||||
coded_block_pattern = 0;
|
coded_block_pattern = 0;
|
||||||
|
@ -381,16 +321,16 @@ static BOOL ipuIDEC(u32 val, bool resume)
|
||||||
g_BP.BP += idec.FB;//skip FB bits
|
g_BP.BP += idec.FB;//skip FB bits
|
||||||
//from IPU_CTRL
|
//from IPU_CTRL
|
||||||
ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
|
ipuRegs->ctrl.PCT = I_TYPE; //Intra DECoding;)
|
||||||
decoder.coding_type = ipuRegs->ctrl.PCT;
|
decoder.coding_type = ipuRegs->ctrl.PCT;
|
||||||
decoder.mpeg1 = ipuRegs->ctrl.MP1;
|
decoder.mpeg1 = ipuRegs->ctrl.MP1;
|
||||||
decoder.q_scale_type = ipuRegs->ctrl.QST;
|
decoder.q_scale_type = ipuRegs->ctrl.QST;
|
||||||
decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
|
decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
|
||||||
decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
|
decoder.scantype = ipuRegs->ctrl.AS;
|
||||||
decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
|
decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
|
||||||
|
|
||||||
//from IDEC value
|
//from IDEC value
|
||||||
decoder.quantizer_scale = idec.QSC;
|
decoder.quantizer_scale = idec.QSC;
|
||||||
decoder.frame_pred_frame_dct = !idec.DTD;
|
decoder.frame_pred_frame_dct= !idec.DTD;
|
||||||
decoder.sgn = idec.SGN;
|
decoder.sgn = idec.SGN;
|
||||||
decoder.dte = idec.DTE;
|
decoder.dte = idec.DTE;
|
||||||
decoder.ofm = idec.OFM;
|
decoder.ofm = idec.OFM;
|
||||||
|
@ -414,21 +354,21 @@ static __forceinline BOOL ipuBDEC(u32 val, bool resume)
|
||||||
if (IsDebugBuild) s_bdec++;
|
if (IsDebugBuild) s_bdec++;
|
||||||
|
|
||||||
g_BP.BP += bdec.FB;//skip FB bits
|
g_BP.BP += bdec.FB;//skip FB bits
|
||||||
decoder.coding_type = I_TYPE;
|
decoder.coding_type = I_TYPE;
|
||||||
decoder.mpeg1 = ipuRegs->ctrl.MP1;
|
decoder.mpeg1 = ipuRegs->ctrl.MP1;
|
||||||
decoder.q_scale_type = ipuRegs->ctrl.QST;
|
decoder.q_scale_type = ipuRegs->ctrl.QST;
|
||||||
decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
|
decoder.intra_vlc_format = ipuRegs->ctrl.IVF;
|
||||||
decoder.scan = ipuRegs->ctrl.AS ? mpeg2_scan_alt : mpeg2_scan_norm;
|
decoder.scantype = ipuRegs->ctrl.AS;
|
||||||
decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
|
decoder.intra_dc_precision = ipuRegs->ctrl.IDP;
|
||||||
|
|
||||||
//from BDEC value
|
//from BDEC value
|
||||||
decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
|
decoder.quantizer_scale = decoder.q_scale_type ? non_linear_quantizer_scale [bdec.QSC] : bdec.QSC << 1;
|
||||||
decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
|
decoder.macroblock_modes = bdec.DT ? DCT_TYPE_INTERLACED : 0;
|
||||||
decoder.dcr = bdec.DCR;
|
decoder.dcr = bdec.DCR;
|
||||||
decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
|
decoder.macroblock_modes |= bdec.MBI ? MACROBLOCK_INTRA : MACROBLOCK_PATTERN;
|
||||||
|
|
||||||
memzero_sse_a(mb8);
|
memzero_sse_a(decoder.mb8);
|
||||||
memzero_sse_a(mb16);
|
memzero_sse_a(decoder.mb16);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mpeg2_slice();
|
return mpeg2_slice();
|
||||||
|
@ -516,6 +456,8 @@ static BOOL ipuSETIQ(u32 val)
|
||||||
|
|
||||||
if ((val >> 27) & 1)
|
if ((val >> 27) & 1)
|
||||||
{
|
{
|
||||||
|
u8 (&niq)[64] = decoder.niq;
|
||||||
|
|
||||||
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
||||||
{
|
{
|
||||||
if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
if (!getBits64((u8*)niq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
||||||
|
@ -531,6 +473,8 @@ static BOOL ipuSETIQ(u32 val)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
u8 (&iq)[64] = decoder.iq;
|
||||||
|
|
||||||
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
||||||
{
|
{
|
||||||
if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
if (!getBits64((u8*)iq + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
||||||
|
@ -552,7 +496,7 @@ static BOOL ipuSETVQ(u32 val)
|
||||||
{
|
{
|
||||||
for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)
|
for(;ipu_cmd.pos[0] < 4; ipu_cmd.pos[0]++)
|
||||||
{
|
{
|
||||||
if (!getBits64((u8*)vqclut + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
if (!getBits64(((u8*)vqclut) + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
|
IPU_LOG("IPU SETVQ command.\nRead VQCLUT table from IPU FIFO.");
|
||||||
|
@ -591,17 +535,17 @@ static BOOL __fastcall ipuCSC(u32 val)
|
||||||
{
|
{
|
||||||
for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
|
for(;ipu_cmd.pos[0] < 48; ipu_cmd.pos[0]++)
|
||||||
{
|
{
|
||||||
if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
ipu_csc(mb8, rgb32, 0);
|
ipu_csc(decoder.mb8, decoder.rgb32, 0);
|
||||||
if (csc.OFM) ipu_dither(rgb32, rgb16, csc.DTE);
|
if (csc.OFM) ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
|
||||||
|
|
||||||
if (csc.OFM)
|
if (csc.OFM)
|
||||||
{
|
{
|
||||||
while (ipu_cmd.pos[1] < 32)
|
while (ipu_cmd.pos[1] < 32)
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
|
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
|
||||||
|
|
||||||
if (ipu_cmd.pos[1] <= 0) return FALSE;
|
if (ipu_cmd.pos[1] <= 0) return FALSE;
|
||||||
}
|
}
|
||||||
|
@ -610,7 +554,7 @@ static BOOL __fastcall ipuCSC(u32 val)
|
||||||
{
|
{
|
||||||
while (ipu_cmd.pos[1] < 64)
|
while (ipu_cmd.pos[1] < 64)
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]);
|
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb32) + 4 * ipu_cmd.pos[1], 64 - ipu_cmd.pos[1]);
|
||||||
|
|
||||||
if (ipu_cmd.pos[1] <= 0) return FALSE;
|
if (ipu_cmd.pos[1] <= 0) return FALSE;
|
||||||
}
|
}
|
||||||
|
@ -633,17 +577,17 @@ static BOOL ipuPACK(u32 val)
|
||||||
{
|
{
|
||||||
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
for(;ipu_cmd.pos[0] < 8; ipu_cmd.pos[0]++)
|
||||||
{
|
{
|
||||||
if (!getBits64((u8*)&mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
if (!getBits64((u8*)&decoder.mb8 + 8 * ipu_cmd.pos[0], 1)) return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
ipu_csc(mb8, rgb32, 0);
|
ipu_csc(decoder.mb8, decoder.rgb32, 0);
|
||||||
ipu_dither(rgb32, rgb16, csc.DTE);
|
ipu_dither(decoder.rgb32, decoder.rgb16, csc.DTE);
|
||||||
|
|
||||||
if (csc.OFM) ipu_vq(rgb16, indx4);
|
if (csc.OFM) ipu_vq(decoder.rgb16, indx4);
|
||||||
|
|
||||||
if (csc.OFM)
|
if (csc.OFM)
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
|
ipu_cmd.pos[1] += ipu_fifo.out.write(((u32*) & decoder.rgb16) + 4 * ipu_cmd.pos[1], 32 - ipu_cmd.pos[1]);
|
||||||
|
|
||||||
if (ipu_cmd.pos[1] < 32) return FALSE;
|
if (ipu_cmd.pos[1] < 32) return FALSE;
|
||||||
}
|
}
|
||||||
|
|
|
@ -342,8 +342,6 @@ struct tIPU_cmd
|
||||||
|
|
||||||
extern tIPU_cmd ipu_cmd;
|
extern tIPU_cmd ipu_cmd;
|
||||||
extern int coded_block_pattern;
|
extern int coded_block_pattern;
|
||||||
extern int g_nIPU0Data; // or 0x80000000 whenever transferring
|
|
||||||
extern u8* g_pIPU0Pointer;
|
|
||||||
extern IPUStatus IPU1Status;
|
extern IPUStatus IPU1Status;
|
||||||
extern tIPU_DMA g_nDMATransfer;
|
extern tIPU_DMA g_nDMATransfer;
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@
|
||||||
#include "mpeg2lib/Mpeg.h"
|
#include "mpeg2lib/Mpeg.h"
|
||||||
|
|
||||||
|
|
||||||
IPU_Fifo ipu_fifo;
|
__aligned16 IPU_Fifo ipu_fifo;
|
||||||
|
|
||||||
void IPU_Fifo::init()
|
void IPU_Fifo::init()
|
||||||
{
|
{
|
||||||
|
@ -167,3 +167,32 @@ void IPU_Fifo_Output::readsingle(void *value)
|
||||||
_readsingle(value);
|
_readsingle(value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__forceinline bool decoder_t::ReadIpuData(u128* out)
|
||||||
|
{
|
||||||
|
if(decoder.ipu0_data == 0) return false;
|
||||||
|
_mm_store_ps((float*)out, _mm_load_ps((float*)GetIpuDataPtr()));
|
||||||
|
|
||||||
|
--ipu0_data;
|
||||||
|
++ipu0_idx;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void __fastcall ReadFIFO_page_7(u32 mem, u64 *out)
|
||||||
|
{
|
||||||
|
pxAssert( (mem >= IPUout_FIFO) && (mem < D0_CHCR) );
|
||||||
|
|
||||||
|
// All addresses in this page map to 0x7000 and 0x7010:
|
||||||
|
mem &= 0x10;
|
||||||
|
|
||||||
|
if (mem == 0) // IPUout_FIFO
|
||||||
|
{
|
||||||
|
if (decoder.ReadIpuData((u128*)out))
|
||||||
|
{
|
||||||
|
ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else // IPUin_FIFO
|
||||||
|
ipu_fifo.out.readsingle((void*)out);
|
||||||
|
}
|
||||||
|
|
|
@ -16,12 +16,14 @@
|
||||||
#ifndef IPU_FIFO_H_INCLUDED
|
#ifndef IPU_FIFO_H_INCLUDED
|
||||||
#define IPU_FIFO_H_INCLUDED
|
#define IPU_FIFO_H_INCLUDED
|
||||||
|
|
||||||
class IPU_Fifo_Input
|
// Important! All FIFO containers in this header should be 'struct' type, not class type.
|
||||||
{
|
// They are saved into the savestate as-is, and keeping them as struct ensures that the
|
||||||
public:
|
// layout of their contents is reliable.
|
||||||
|
|
||||||
int readpos, writepos;
|
struct IPU_Fifo_Input
|
||||||
|
{
|
||||||
__aligned16 u32 data[32];
|
__aligned16 u32 data[32];
|
||||||
|
int readpos, writepos;
|
||||||
|
|
||||||
int write(u32* pMem, int size);
|
int write(u32* pMem, int size);
|
||||||
int read(void *value);
|
int read(void *value);
|
||||||
|
@ -29,12 +31,10 @@ class IPU_Fifo_Input
|
||||||
wxString desc() const;
|
wxString desc() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
class IPU_Fifo_Output
|
struct IPU_Fifo_Output
|
||||||
{
|
{
|
||||||
public:
|
|
||||||
|
|
||||||
int readpos, writepos;
|
|
||||||
__aligned16 u32 data[32];
|
__aligned16 u32 data[32];
|
||||||
|
int readpos, writepos;
|
||||||
|
|
||||||
// returns number of qw read
|
// returns number of qw read
|
||||||
int write(const u32 * value, int size);
|
int write(const u32 * value, int size);
|
||||||
|
@ -42,20 +42,19 @@ class IPU_Fifo_Output
|
||||||
void readsingle(void *value);
|
void readsingle(void *value);
|
||||||
void clear();
|
void clear();
|
||||||
wxString desc() const;
|
wxString desc() const;
|
||||||
private:
|
|
||||||
void _readsingle(void *value);
|
void _readsingle(void *value);
|
||||||
};
|
};
|
||||||
|
|
||||||
class IPU_Fifo
|
struct IPU_Fifo
|
||||||
{
|
{
|
||||||
public:
|
__aligned16 IPU_Fifo_Input in;
|
||||||
IPU_Fifo_Input in;
|
__aligned16 IPU_Fifo_Output out;
|
||||||
IPU_Fifo_Output out;
|
|
||||||
|
|
||||||
void init();
|
void init();
|
||||||
void clear();
|
void clear();
|
||||||
};
|
};
|
||||||
|
|
||||||
extern IPU_Fifo ipu_fifo;
|
extern __aligned16 IPU_Fifo ipu_fifo;
|
||||||
|
|
||||||
#endif // IPU_FIFO_H_INCLUDED
|
#endif // IPU_FIFO_H_INCLUDED
|
||||||
|
|
|
@ -22,10 +22,15 @@
|
||||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// [TODO] : There are modern SSE versions of idct (idct_mmx.c) in the mpeg2 libs that we
|
||||||
|
// should probably upgrade to. They use their own raw-style intrinsics and not the intel
|
||||||
|
// compiler-integrated ones.
|
||||||
|
|
||||||
#include "PrecompiledHeader.h"
|
#include "PrecompiledHeader.h"
|
||||||
|
|
||||||
#include "Common.h"
|
#include "Common.h"
|
||||||
#include "IPU/IPU.h"
|
#include "IPU/IPU.h"
|
||||||
|
#include "Mpeg.h"
|
||||||
|
|
||||||
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
|
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
|
||||||
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
|
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
|
||||||
|
@ -36,19 +41,14 @@
|
||||||
#define clp(val,res) res = (val < 0) ? 0 : ((val > 255) ? 255 : val);
|
#define clp(val,res) res = (val < 0) ? 0 : ((val > 255) ? 255 : val);
|
||||||
#define clp2(val,res) res = (val < -255) ? -255 : ((val > 255) ? 255 : val);
|
#define clp2(val,res) res = (val < -255) ? -255 : ((val > 255) ? 255 : val);
|
||||||
|
|
||||||
/* idct main entry point */
|
|
||||||
void (__fastcall *mpeg2_idct_copy) (s16 * block, u8 * dest, int stride);
|
|
||||||
/* JayteeMaster: changed dest to 16 bit signed */
|
|
||||||
void (__fastcall *mpeg2_idct_add) (int last, s16 * block,
|
|
||||||
/*u8*/s16 * dest, int stride);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In legal streams, the IDCT output should be between -384 and +384.
|
* In legal streams, the IDCT output should be between -384 and +384.
|
||||||
* In corrupted streams, it is possible to force the IDCT output to go
|
* In corrupted streams, it is possible to force the IDCT output to go
|
||||||
* to +-3826 - this is the worst case for a column IDCT where the
|
* to +-3826 - this is the worst case for a column IDCT where the
|
||||||
* column inputs are 16-bit values.
|
* column inputs are 16-bit values.
|
||||||
*/
|
*/
|
||||||
static u8 clip_lut[1024];
|
static __aligned16 u8 clip_lut[1024];
|
||||||
|
|
||||||
#define CLIP(i) ((clip_lut+384)[(i)])
|
#define CLIP(i) ((clip_lut+384)[(i)])
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
@ -75,13 +75,13 @@ static __forceinline void idct_row (s16 * const block)
|
||||||
/* shortcut */
|
/* shortcut */
|
||||||
if (!(block[1] | ((s32 *)block)[1] | ((s32 *)block)[2] |
|
if (!(block[1] | ((s32 *)block)[1] | ((s32 *)block)[2] |
|
||||||
((s32 *)block)[3])) {
|
((s32 *)block)[3])) {
|
||||||
u32 tmp = (u16) (block[0] << 3);
|
u32 tmp = (u16) (block[0] << 3);
|
||||||
tmp |= tmp << 16;
|
tmp |= tmp << 16;
|
||||||
((s32 *)block)[0] = tmp;
|
((s32 *)block)[0] = tmp;
|
||||||
((s32 *)block)[1] = tmp;
|
((s32 *)block)[1] = tmp;
|
||||||
((s32 *)block)[2] = tmp;
|
((s32 *)block)[2] = tmp;
|
||||||
((s32 *)block)[3] = tmp;
|
((s32 *)block)[3] = tmp;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
d0 = (block[0] << 11) + 128;
|
d0 = (block[0] << 11) + 128;
|
||||||
|
@ -160,122 +160,97 @@ static __forceinline void idct_col (s16 * const block)
|
||||||
block[8*7] = (a0 - b0) >> 17;
|
block[8*7] = (a0 - b0) >> 17;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __fastcall mpeg2_idct_copy_c (s16 * block, u8 * dest,
|
__releaseinline void mpeg2_idct_copy(s16 * block, u8 * dest, const int stride)
|
||||||
const int stride)
|
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < 8; i++)
|
for (i = 0; i < 8; i++)
|
||||||
idct_row (block + 8 * i);
|
idct_row (block + 8 * i);
|
||||||
for (i = 0; i < 8; i++)
|
for (i = 0; i < 8; i++)
|
||||||
idct_col (block + i);
|
idct_col (block + i);
|
||||||
|
|
||||||
|
__m128 zero = _mm_setzero_ps();
|
||||||
do {
|
do {
|
||||||
dest[0] = CLIP (block[0]);
|
dest[0] = CLIP (block[0]);
|
||||||
dest[1] = CLIP (block[1]);
|
dest[1] = CLIP (block[1]);
|
||||||
dest[2] = CLIP (block[2]);
|
dest[2] = CLIP (block[2]);
|
||||||
dest[3] = CLIP (block[3]);
|
dest[3] = CLIP (block[3]);
|
||||||
dest[4] = CLIP (block[4]);
|
dest[4] = CLIP (block[4]);
|
||||||
dest[5] = CLIP (block[5]);
|
dest[5] = CLIP (block[5]);
|
||||||
dest[6] = CLIP (block[6]);
|
dest[6] = CLIP (block[6]);
|
||||||
dest[7] = CLIP (block[7]);
|
dest[7] = CLIP (block[7]);
|
||||||
|
|
||||||
block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0;
|
_mm_store_ps((float*)block, zero);
|
||||||
block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0;
|
|
||||||
|
|
||||||
dest += stride;
|
dest += stride;
|
||||||
block += 8;
|
block += 8;
|
||||||
} while (--i);
|
} while (--i);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* JayteeMaster: changed dest to 16 bit signed */
|
|
||||||
static void __fastcall mpeg2_idct_add_c (const int last, s16 * block,
|
// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
|
||||||
/*u8*/s16 * dest, const int stride)
|
__releaseinline void mpeg2_idct_add (const int last, s16 * block, s16 * dest, const int stride)
|
||||||
{
|
{
|
||||||
int i;
|
// on the IPU, stride is always assured to be multiples of QWC (bottom 3 bits are 0).
|
||||||
|
|
||||||
if (last != 129 || (block[0] & 7) == 4) {
|
if (last != 129 || (block[0] & 7) == 4)
|
||||||
for (i = 0; i < 8; i++)
|
{
|
||||||
idct_row (block + 8 * i);
|
int i;
|
||||||
for (i = 0; i < 8; i++)
|
for (i = 0; i < 8; i++)
|
||||||
idct_col (block + i);
|
idct_row (block + 8 * i);
|
||||||
do {
|
for (i = 0; i < 8; i++)
|
||||||
dest[0] = block[0];
|
idct_col (block + i);
|
||||||
dest[1] = block[1];
|
|
||||||
dest[2] = block[2];
|
|
||||||
dest[3] = block[3];
|
|
||||||
dest[4] = block[4];
|
|
||||||
dest[5] = block[5];
|
|
||||||
dest[6] = block[6];
|
|
||||||
dest[7] = block[7];
|
|
||||||
|
|
||||||
block[0] = 0; block[1] = 0; block[2] = 0; block[3] = 0;
|
__m128 zero = _mm_setzero_ps();
|
||||||
block[4] = 0; block[5] = 0; block[6] = 0; block[7] = 0;
|
do {
|
||||||
|
_mm_store_ps((float*)dest, _mm_load_ps((float*)block));
|
||||||
|
_mm_store_ps((float*)block, zero);
|
||||||
|
|
||||||
dest += stride;
|
dest += stride;
|
||||||
block += 8;
|
block += 8;
|
||||||
} while (--i);
|
} while (--i);
|
||||||
} else {
|
|
||||||
int DC;
|
|
||||||
|
|
||||||
DC = (block[0] + 4) >> 3;
|
}
|
||||||
block[0] = block[63] = 0;
|
else
|
||||||
i = 8;
|
{
|
||||||
do {
|
int DC = (block[0] + 4) >> 3;
|
||||||
dest[0] = DC;
|
s16 dcf[2] = { DC, DC };
|
||||||
dest[1] = DC;
|
block[0] = block[63] = 0;
|
||||||
dest[2] = DC;
|
|
||||||
dest[3] = DC;
|
__m128 dc128 = _mm_set_ps1(*(float*)dcf);
|
||||||
dest[4] = DC;
|
|
||||||
dest[5] = DC;
|
for(int i=0; i<8; ++i)
|
||||||
dest[6] = DC;
|
_mm_store_ps((float*)(dest+(stride*i)), dc128);
|
||||||
dest[7] = DC;
|
|
||||||
dest += stride;
|
|
||||||
} while (--i);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern "C"
|
mpeg2_scan_pack::mpeg2_scan_pack()
|
||||||
{
|
{
|
||||||
u8 mpeg2_scan_norm[64] = {
|
static const u8 mpeg2_scan_norm[64] = {
|
||||||
/* Zig-Zag scan pattern */
|
/* Zig-Zag scan pattern */
|
||||||
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
|
0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
|
||||||
12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
|
12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
|
||||||
35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
|
35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
|
||||||
58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
|
58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
|
||||||
};
|
};
|
||||||
|
|
||||||
u8 mpeg2_scan_alt[64] = {
|
static const u8 mpeg2_scan_alt[64] = {
|
||||||
/* Alternate scan pattern */
|
/* Alternate scan pattern */
|
||||||
0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
|
0, 8, 16, 24, 1, 9, 2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
|
||||||
41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
|
41, 33, 26, 18, 3, 11, 4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
|
||||||
51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
|
51, 59, 20, 28, 5, 13, 6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
|
||||||
53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
|
53, 61, 22, 30, 7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
// The MMX verson wasn't being used and it was only available as a .obj,
|
for (int i = -384; i < 640; i++)
|
||||||
// so I removed it (gigaherz).
|
|
||||||
///* idct_mmx.c */
|
|
||||||
//void mpeg2_idct_copy_mmxext (s16 * block, u8 * dest, int stride);
|
|
||||||
//void mpeg2_idct_add_mmxext (int last, s16 * block,
|
|
||||||
// s16 * dest, int stride);
|
|
||||||
//void mpeg2_idct_copy_mmx (s16 * block, u8 * dest, int stride);
|
|
||||||
//void mpeg2_idct_add_mmx (int last, s16 * block,
|
|
||||||
// s16 * dest, int stride);
|
|
||||||
//void mpeg2_idct_mmx_init (void);
|
|
||||||
|
|
||||||
void mpeg2_idct_init()
|
|
||||||
{
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
mpeg2_idct_copy = mpeg2_idct_copy_c;
|
|
||||||
mpeg2_idct_add = mpeg2_idct_add_c;
|
|
||||||
for (i = -384; i < 640; i++)
|
|
||||||
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
|
clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
|
||||||
for (i = 0; i < 64; i++) {
|
|
||||||
j = mpeg2_scan_norm[i];
|
for (int i = 0; i < 64; i++) {
|
||||||
mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
int j = mpeg2_scan_norm[i];
|
||||||
|
norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
||||||
j = mpeg2_scan_alt[i];
|
j = mpeg2_scan_alt[i];
|
||||||
mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const __aligned16 mpeg2_scan_pack mpeg2_scan;
|
||||||
|
|
|
@ -33,7 +33,7 @@
|
||||||
#include "Mpeg.h"
|
#include "Mpeg.h"
|
||||||
#include "Vlc.h"
|
#include "Vlc.h"
|
||||||
|
|
||||||
int non_linear_quantizer_scale [] =
|
const int non_linear_quantizer_scale [] =
|
||||||
{
|
{
|
||||||
0, 1, 2, 3, 4, 5, 6, 7,
|
0, 1, 2, 3, 4, 5, 6, 7,
|
||||||
8, 10, 12, 14, 16, 18, 20, 22,
|
8, 10, 12, 14, 16, 18, 20, 22,
|
||||||
|
@ -341,8 +341,8 @@ static __forceinline bool get_intra_block()
|
||||||
int i;
|
int i;
|
||||||
int j;
|
int j;
|
||||||
int val;
|
int val;
|
||||||
const u8 * scan = decoder.scan;
|
const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
|
||||||
const u8 * quant_matrix = decoder.intra_quantizer_matrix;
|
const u8 (&quant_matrix)[64] = decoder.iq;
|
||||||
int quantizer_scale = decoder.quantizer_scale;
|
int quantizer_scale = decoder.quantizer_scale;
|
||||||
s16 * dest = decoder.DCTblock;
|
s16 * dest = decoder.DCTblock;
|
||||||
u16 code;
|
u16 code;
|
||||||
|
@ -493,8 +493,8 @@ static __forceinline bool get_non_intra_block(int * last)
|
||||||
int i;
|
int i;
|
||||||
int j;
|
int j;
|
||||||
int val;
|
int val;
|
||||||
const u8 * scan = decoder.scan;
|
const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
|
||||||
const u8 * quant_matrix = decoder.non_intra_quantizer_matrix;
|
const u8 (&quant_matrix)[64] = decoder.niq;
|
||||||
int quantizer_scale = decoder.quantizer_scale;
|
int quantizer_scale = decoder.quantizer_scale;
|
||||||
s16 * dest = decoder.DCTblock;
|
s16 * dest = decoder.DCTblock;
|
||||||
u16 code;
|
u16 code;
|
||||||
|
@ -699,7 +699,6 @@ void __forceinline finishmpeg2sliceIDEC()
|
||||||
|
|
||||||
bool mpeg2sliceIDEC()
|
bool mpeg2sliceIDEC()
|
||||||
{
|
{
|
||||||
u32 read;
|
|
||||||
u16 code;
|
u16 code;
|
||||||
u8 bit8;
|
u8 bit8;
|
||||||
|
|
||||||
|
@ -725,6 +724,10 @@ bool mpeg2sliceIDEC()
|
||||||
ipu_cmd.pos[0] = 2;
|
ipu_cmd.pos[0] = 2;
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
|
macroblock_8& mb8 = decoder.mb8;
|
||||||
|
macroblock_rgb16& rgb16 = decoder.rgb16;
|
||||||
|
macroblock_rgb32& rgb32 = decoder.rgb32;
|
||||||
|
|
||||||
int DCT_offset, DCT_stride;
|
int DCT_offset, DCT_stride;
|
||||||
const MBAtab * mba;
|
const MBAtab * mba;
|
||||||
|
|
||||||
|
@ -747,13 +750,13 @@ bool mpeg2sliceIDEC()
|
||||||
|
|
||||||
if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
|
if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
|
||||||
{
|
{
|
||||||
DCT_offset = decoder.stride;
|
DCT_offset = decoder_stride;
|
||||||
DCT_stride = decoder.stride * 2;
|
DCT_stride = decoder_stride * 2;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
DCT_offset = decoder.stride * 8;
|
DCT_offset = decoder_stride * 8;
|
||||||
DCT_stride = decoder.stride;
|
DCT_stride = decoder_stride;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (ipu_cmd.pos[2])
|
switch (ipu_cmd.pos[2])
|
||||||
|
@ -784,13 +787,13 @@ bool mpeg2sliceIDEC()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
case 5:
|
case 5:
|
||||||
if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[2] == 5))
|
if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[2] == 5))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[2] = 5;
|
ipu_cmd.pos[2] = 5;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
case 6:
|
case 6:
|
||||||
if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[2] == 6))
|
if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[2] == 6))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[2] = 6;
|
ipu_cmd.pos[2] = 6;
|
||||||
return false;
|
return false;
|
||||||
|
@ -801,22 +804,17 @@ bool mpeg2sliceIDEC()
|
||||||
ipu_csc(mb8, rgb32, decoder.sgn);
|
ipu_csc(mb8, rgb32, decoder.sgn);
|
||||||
|
|
||||||
if (decoder.ofm == 0)
|
if (decoder.ofm == 0)
|
||||||
{
|
decoder.SetOutputTo(rgb32);
|
||||||
g_nIPU0Data = 64;
|
|
||||||
g_pIPU0Pointer = (u8*)&rgb32;
|
|
||||||
}
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
ipu_dither(rgb32, rgb16, decoder.dte);
|
ipu_dither(rgb32, rgb16, decoder.dte);
|
||||||
|
decoder.SetOutputTo(rgb16);
|
||||||
g_nIPU0Data = 32;
|
|
||||||
g_pIPU0Pointer = (u8*)&rgb16;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
case 2:
|
case 2:
|
||||||
while (g_nIPU0Data > 0)
|
while (decoder.ipu0_data > 0)
|
||||||
{
|
{
|
||||||
read = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
|
uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
|
||||||
|
|
||||||
if (read == 0)
|
if (read == 0)
|
||||||
{
|
{
|
||||||
|
@ -825,9 +823,7 @@ bool mpeg2sliceIDEC()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
g_pIPU0Pointer += read * 16;
|
decoder.AdvanceIpuDataBy(read);
|
||||||
g_nIPU0Data -= read;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -932,7 +928,9 @@ bool mpeg2_slice()
|
||||||
{
|
{
|
||||||
int DCT_offset, DCT_stride;
|
int DCT_offset, DCT_stride;
|
||||||
u8 bit8;
|
u8 bit8;
|
||||||
u32 size;
|
|
||||||
|
macroblock_8& mb8 = decoder.mb8;
|
||||||
|
macroblock_16& mb16 = decoder.mb16;
|
||||||
|
|
||||||
switch (ipu_cmd.pos[0])
|
switch (ipu_cmd.pos[0])
|
||||||
{
|
{
|
||||||
|
@ -960,13 +958,13 @@ bool mpeg2_slice()
|
||||||
|
|
||||||
if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
|
if (decoder.macroblock_modes & DCT_TYPE_INTERLACED)
|
||||||
{
|
{
|
||||||
DCT_offset = decoder.stride;
|
DCT_offset = decoder_stride;
|
||||||
DCT_stride = decoder.stride * 2;
|
DCT_stride = decoder_stride * 2;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
DCT_offset = decoder.stride * 8;
|
DCT_offset = decoder_stride * 8;
|
||||||
DCT_stride = decoder.stride;
|
DCT_stride = decoder_stride;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (decoder.macroblock_modes & MACROBLOCK_INTRA)
|
if (decoder.macroblock_modes & MACROBLOCK_INTRA)
|
||||||
|
@ -1000,13 +998,13 @@ bool mpeg2_slice()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
case 5:
|
case 5:
|
||||||
if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
|
if (!slice_intra_DCT(1, (u8*)mb8.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] = 5;
|
ipu_cmd.pos[1] = 5;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
case 6:
|
case 6:
|
||||||
if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
|
if (!slice_intra_DCT(2, (u8*)mb8.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] = 6;
|
ipu_cmd.pos[1] = 6;
|
||||||
return false;
|
return false;
|
||||||
|
@ -1063,7 +1061,7 @@ bool mpeg2_slice()
|
||||||
case 5:
|
case 5:
|
||||||
if (decoder.coded_block_pattern & 0x2)
|
if (decoder.coded_block_pattern & 0x2)
|
||||||
{
|
{
|
||||||
if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder.stride >> 1, ipu_cmd.pos[1] == 5))
|
if (!slice_non_intra_DCT((s16*)mb16.Cb, decoder_stride >> 1, ipu_cmd.pos[1] == 5))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] = 5;
|
ipu_cmd.pos[1] = 5;
|
||||||
return false;
|
return false;
|
||||||
|
@ -1072,7 +1070,7 @@ bool mpeg2_slice()
|
||||||
case 6:
|
case 6:
|
||||||
if (decoder.coded_block_pattern & 0x1)
|
if (decoder.coded_block_pattern & 0x1)
|
||||||
{
|
{
|
||||||
if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder.stride >> 1, ipu_cmd.pos[1] == 6))
|
if (!slice_non_intra_DCT((s16*)mb16.Cr, decoder_stride >> 1, ipu_cmd.pos[1] == 6))
|
||||||
{
|
{
|
||||||
ipu_cmd.pos[1] = 6;
|
ipu_cmd.pos[1] = 6;
|
||||||
return false;
|
return false;
|
||||||
|
@ -1083,8 +1081,7 @@ bool mpeg2_slice()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//Send The MacroBlock via DmaIpuFrom
|
// Send The MacroBlock via DmaIpuFrom
|
||||||
size = 0; // Reset
|
|
||||||
ipuRegs->ctrl.SCD = 0;
|
ipuRegs->ctrl.SCD = 0;
|
||||||
coded_block_pattern = decoder.coded_block_pattern;
|
coded_block_pattern = decoder.coded_block_pattern;
|
||||||
g_BP.BP += (int)decoder.bitstream_bits - 16;
|
g_BP.BP += (int)decoder.bitstream_bits - 16;
|
||||||
|
@ -1101,13 +1098,12 @@ bool mpeg2_slice()
|
||||||
}
|
}
|
||||||
|
|
||||||
decoder.mbc = 1;
|
decoder.mbc = 1;
|
||||||
g_nIPU0Data = 48;
|
decoder.SetOutputTo(mb16);
|
||||||
g_pIPU0Pointer = (u8*)&mb16;
|
|
||||||
|
|
||||||
case 3:
|
case 3:
|
||||||
while (g_nIPU0Data > 0)
|
while (decoder.ipu0_data > 0)
|
||||||
{
|
{
|
||||||
size = ipu_fifo.out.write((u32*)g_pIPU0Pointer, g_nIPU0Data);
|
uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
|
||||||
|
|
||||||
if (size == 0)
|
if (size == 0)
|
||||||
{
|
{
|
||||||
|
@ -1116,8 +1112,7 @@ bool mpeg2_slice()
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
g_pIPU0Pointer += size * 16;
|
decoder.AdvanceIpuDataBy(size);
|
||||||
g_nIPU0Data -= size;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -66,6 +66,8 @@ __noinline void memzero_sse_a( T& dest )
|
||||||
#undef MZFqwc
|
#undef MZFqwc
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// the IPU is fixed to 16 byte strides (128-bit / QWC resolution):
|
||||||
|
static const uint decoder_stride = 16;
|
||||||
|
|
||||||
enum macroblock_modes
|
enum macroblock_modes
|
||||||
{
|
{
|
||||||
|
@ -106,25 +108,25 @@ enum picture_coding_type
|
||||||
};
|
};
|
||||||
|
|
||||||
struct macroblock_8{
|
struct macroblock_8{
|
||||||
unsigned char Y[16][16]; //0
|
u8 Y[16][16]; //0
|
||||||
unsigned char Cb[8][8]; //1
|
u8 Cb[8][8]; //1
|
||||||
unsigned char Cr[8][8]; //2
|
u8 Cr[8][8]; //2
|
||||||
};
|
};
|
||||||
|
|
||||||
struct macroblock_16{
|
struct macroblock_16{
|
||||||
short Y[16][16]; //0
|
s16 Y[16][16]; //0
|
||||||
short Cb[8][8]; //1
|
s16 Cb[8][8]; //1
|
||||||
short Cr[8][8]; //2
|
s16 Cr[8][8]; //2
|
||||||
};
|
};
|
||||||
|
|
||||||
struct macroblock_rgb32{
|
struct macroblock_rgb32{
|
||||||
struct {
|
struct {
|
||||||
unsigned char r, g, b, a;
|
u8 r, g, b, a;
|
||||||
} c[16][16];
|
} c[16][16];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct rgb16_t{
|
struct rgb16_t{
|
||||||
unsigned short r:5, g:5, b:5, a:1;
|
u16 r:5, g:5, b:5, a:1;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct macroblock_rgb16{
|
struct macroblock_rgb16{
|
||||||
|
@ -138,24 +140,26 @@ struct decoder_t {
|
||||||
/* DCT coefficients - should be kept aligned ! */
|
/* DCT coefficients - should be kept aligned ! */
|
||||||
s16 DCTblock[64];
|
s16 DCTblock[64];
|
||||||
|
|
||||||
|
u8 niq[64]; //non-intraquant matrix (sequence header)
|
||||||
|
u8 iq[64]; //intraquant matrix (sequence header)
|
||||||
|
|
||||||
|
macroblock_8 mb8;
|
||||||
|
macroblock_16 mb16;
|
||||||
|
macroblock_rgb32 rgb32;
|
||||||
|
macroblock_rgb16 rgb16;
|
||||||
|
|
||||||
|
uint ipu0_data;
|
||||||
|
uint ipu0_idx;
|
||||||
|
|
||||||
/* bit parsing stuff */
|
/* bit parsing stuff */
|
||||||
u32 bitstream_buf; /* current 32 bit working set */
|
u32 bitstream_buf; /* current 32 bit working set */
|
||||||
int bitstream_bits; /* used bits in working set */
|
int bitstream_bits; /* used bits in working set */
|
||||||
|
|
||||||
int stride;
|
|
||||||
|
|
||||||
/* predictor for DC coefficients in intra blocks */
|
|
||||||
s16 dc_dct_pred[3];
|
|
||||||
|
|
||||||
int quantizer_scale; /* remove */
|
int quantizer_scale; /* remove */
|
||||||
int dmv_offset; /* remove */
|
int dmv_offset; /* remove */
|
||||||
|
|
||||||
/* now non-slice-specific information */
|
/* now non-slice-specific information */
|
||||||
|
|
||||||
/* sequence header stuff */
|
|
||||||
u8 *intra_quantizer_matrix;
|
|
||||||
u8 *non_intra_quantizer_matrix;
|
|
||||||
|
|
||||||
/* picture header stuff */
|
/* picture header stuff */
|
||||||
|
|
||||||
/* what type of picture this is (I, P, B, D) */
|
/* what type of picture this is (I, P, B, D) */
|
||||||
|
@ -163,6 +167,9 @@ struct decoder_t {
|
||||||
|
|
||||||
/* picture coding extension stuff */
|
/* picture coding extension stuff */
|
||||||
|
|
||||||
|
/* predictor for DC coefficients in intra blocks */
|
||||||
|
s16 dc_dct_pred[3];
|
||||||
|
|
||||||
/* quantization factor for intra dc coefficients */
|
/* quantization factor for intra dc coefficients */
|
||||||
int intra_dc_precision;
|
int intra_dc_precision;
|
||||||
/* top/bottom/both fields */
|
/* top/bottom/both fields */
|
||||||
|
@ -195,16 +202,47 @@ struct decoder_t {
|
||||||
|
|
||||||
/* stuff derived from bitstream */
|
/* stuff derived from bitstream */
|
||||||
|
|
||||||
/* pointer to the zigzag scan we're supposed to be using */
|
/* the zigzag scan we're supposed to be using, true for alt, false for normal */
|
||||||
const u8 * scan;
|
bool scantype;
|
||||||
|
|
||||||
int second_field;
|
int second_field;
|
||||||
|
|
||||||
int mpeg1;
|
int mpeg1;
|
||||||
|
|
||||||
|
template< typename T >
|
||||||
|
void SetOutputTo( T& obj )
|
||||||
|
{
|
||||||
|
uint mb_offset = ((uptr)&obj - (uptr)&mb8);
|
||||||
|
pxAssume( (mb_offset & 15) == 0 );
|
||||||
|
ipu0_idx = mb_offset / 16;
|
||||||
|
ipu0_data = sizeof(obj)/16;
|
||||||
|
}
|
||||||
|
|
||||||
|
u128* GetIpuDataPtr()
|
||||||
|
{
|
||||||
|
return ((u128*)&mb8) + ipu0_idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
void AdvanceIpuDataBy(uint amt)
|
||||||
|
{
|
||||||
|
pxAssumeDev(ipu0_data>=amt, "IPU FIFO Overflow on advance!" );
|
||||||
|
ipu0_idx += amt;
|
||||||
|
ipu0_data -= amt;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ReadIpuData(u128* out);
|
||||||
};
|
};
|
||||||
|
|
||||||
extern void (__fastcall *mpeg2_idct_copy) (s16 * block, u8* dest, int stride);
|
struct mpeg2_scan_pack
|
||||||
extern void (__fastcall *mpeg2_idct_add) (int last, s16 * block, s16* dest, int stride);
|
{
|
||||||
|
u8 norm[64];
|
||||||
|
u8 alt[64];
|
||||||
|
|
||||||
|
mpeg2_scan_pack();
|
||||||
|
};
|
||||||
|
|
||||||
|
extern void mpeg2_idct_copy(s16 * block, u8* dest, int stride);
|
||||||
|
extern void mpeg2_idct_add(int last, s16 * block, s16* dest, int stride);
|
||||||
|
|
||||||
#define IDEC 0
|
#define IDEC 0
|
||||||
#define BDEC 1
|
#define BDEC 1
|
||||||
|
@ -217,16 +255,12 @@ extern int get_macroblock_modes();
|
||||||
extern int get_motion_delta(const int f_code);
|
extern int get_motion_delta(const int f_code);
|
||||||
extern int get_dmv();
|
extern int get_dmv();
|
||||||
|
|
||||||
extern int non_linear_quantizer_scale[];
|
|
||||||
|
|
||||||
extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
|
extern void ipu_csc(macroblock_8& mb8, macroblock_rgb32& rgb32, int sgn);
|
||||||
extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
|
extern void ipu_dither(const macroblock_rgb32& rgb32, macroblock_rgb16& rgb16, int dte);
|
||||||
extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
|
extern void ipu_vq(macroblock_rgb16& rgb16, u8* indx4);
|
||||||
extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);
|
extern void ipu_copy(const macroblock_8& mb8, macroblock_16& mb16);
|
||||||
|
|
||||||
extern int slice (u8 * buffer);
|
extern int slice (u8 * buffer);
|
||||||
/* idct.c */
|
|
||||||
extern void mpeg2_idct_init ();
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#define BigEndian(out, in) out = _byteswap_ulong(in)
|
#define BigEndian(out, in) out = _byteswap_ulong(in)
|
||||||
|
@ -240,13 +274,12 @@ extern void mpeg2_idct_init ();
|
||||||
#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
|
#define BigEndian64(out, in) out = __builtin_bswap64(in) // or we could use the asm function bswap...
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
extern __aligned16 const mpeg2_scan_pack mpeg2_scan;
|
||||||
|
extern const int non_linear_quantizer_scale[];
|
||||||
|
|
||||||
// The IPU can only do one task at once and never uses other buffers so all mpeg state variables
|
// The IPU can only do one task at once and never uses other buffers so all mpeg state variables
|
||||||
// are made available to mpeg/vlc modules as globals here:
|
// are made available to mpeg/vlc modules as globals here:
|
||||||
|
|
||||||
extern __aligned16 tIPU_BP g_BP;
|
extern __aligned16 tIPU_BP g_BP;
|
||||||
extern __aligned16 decoder_t decoder;
|
extern __aligned16 decoder_t decoder;
|
||||||
extern __aligned16 macroblock_8 mb8;
|
|
||||||
extern __aligned16 macroblock_16 mb16;
|
|
||||||
extern __aligned16 macroblock_rgb32 rgb32;
|
|
||||||
extern __aligned16 macroblock_rgb16 rgb16;
|
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,9 @@
|
||||||
// conforming implementation for reference, do not optimise
|
// conforming implementation for reference, do not optimise
|
||||||
void yuv2rgb_reference(void)
|
void yuv2rgb_reference(void)
|
||||||
{
|
{
|
||||||
|
const macroblock_8& mb8 = decoder.mb8;
|
||||||
|
macroblock_rgb32& rgb32 = decoder.rgb32;
|
||||||
|
|
||||||
for (int y = 0; y < 16; y++)
|
for (int y = 0; y < 16; y++)
|
||||||
for (int x = 0; x < 16; x++)
|
for (int x = 0; x < 16; x++)
|
||||||
{
|
{
|
||||||
|
@ -124,8 +127,8 @@ __releaseinline void yuv2rgb_sse2(void)
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
tworows:
|
tworows:
|
||||||
movq xmm3, qword ptr [mb8+256+esi]
|
movq xmm3, qword ptr [decoder.mb8+256+esi]
|
||||||
movq xmm1, qword ptr [mb8+320+esi]
|
movq xmm1, qword ptr [decoder.mb8+320+esi]
|
||||||
pxor xmm2, xmm2
|
pxor xmm2, xmm2
|
||||||
pxor xmm0, xmm0
|
pxor xmm0, xmm0
|
||||||
// could skip the movq but punpck requires 128-bit alignment
|
// could skip the movq but punpck requires 128-bit alignment
|
||||||
|
@ -170,7 +173,7 @@ ihatemsvc:
|
||||||
movaps xmm4, xmm1
|
movaps xmm4, xmm1
|
||||||
movaps xmm5, xmm2
|
movaps xmm5, xmm2
|
||||||
|
|
||||||
movaps xmm6, xmmword ptr [mb8+edi]
|
movaps xmm6, xmmword ptr [decoder.mb8+edi]
|
||||||
psubusb xmm6, xmmword ptr [edx+Y_BIAS]
|
psubusb xmm6, xmmword ptr [edx+Y_BIAS]
|
||||||
movaps xmm7, xmm6
|
movaps xmm7, xmm6
|
||||||
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
||||||
|
@ -235,10 +238,10 @@ ihatemsvc:
|
||||||
punpckhwd xmm4, xmm5
|
punpckhwd xmm4, xmm5
|
||||||
|
|
||||||
// at last
|
// at last
|
||||||
movaps xmmword ptr [rgb32+edi*4+0], xmm0
|
movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
|
||||||
movaps xmmword ptr [rgb32+edi*4+16], xmm1
|
movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
|
||||||
movaps xmmword ptr [rgb32+edi*4+32], xmm3
|
movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
|
||||||
movaps xmmword ptr [rgb32+edi*4+48], xmm4
|
movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
|
||||||
|
|
||||||
add edi, 16
|
add edi, 16
|
||||||
|
|
||||||
|
@ -255,6 +258,8 @@ ihatemsvc:
|
||||||
// offset to the middle of the sse2 table, so that we can use 1-byte address displacement
|
// offset to the middle of the sse2 table, so that we can use 1-byte address displacement
|
||||||
// to access all fields:
|
// to access all fields:
|
||||||
static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
|
static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
|
||||||
|
static const macroblock_8* mb8 = (u8*)decoder.mb8;
|
||||||
|
static macroblock_rgb32* rgb32 = (u8*)decoder.rgb32;
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
__asm__ __volatile__ (
|
||||||
".intel_syntax noprefix\n"
|
".intel_syntax noprefix\n"
|
||||||
|
@ -262,15 +267,10 @@ ihatemsvc:
|
||||||
"xor esi, esi\n"
|
"xor esi, esi\n"
|
||||||
"xor edi, edi\n"
|
"xor edi, edi\n"
|
||||||
|
|
||||||
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
|
|
||||||
// This saves 2-3 bytes per instruction where these are used. :)
|
|
||||||
//"mov ecx, offset %c[yuv2rgb_temp]\n"
|
|
||||||
//"mov edx, offset %c[sse2_tables]+64\n"
|
|
||||||
|
|
||||||
".align 16\n"
|
".align 16\n"
|
||||||
"tworows:\n"
|
"tworows:\n"
|
||||||
"movq xmm3, qword ptr [mb8+256+esi]\n"
|
"movq xmm3, qword ptr [%[mb8]+256+esi]\n"
|
||||||
"movq xmm1, qword ptr [mb8+320+esi]\n"
|
"movq xmm1, qword ptr [%[mb8]+320+esi]\n"
|
||||||
"pxor xmm2, xmm2\n"
|
"pxor xmm2, xmm2\n"
|
||||||
"pxor xmm0, xmm0\n"
|
"pxor xmm0, xmm0\n"
|
||||||
// could skip the movq but punpck requires 128-bit alignment
|
// could skip the movq but punpck requires 128-bit alignment
|
||||||
|
@ -310,7 +310,7 @@ ihatemsvc:
|
||||||
"movaps xmm4, xmm1\n"
|
"movaps xmm4, xmm1\n"
|
||||||
"movaps xmm5, xmm2\n"
|
"movaps xmm5, xmm2\n"
|
||||||
|
|
||||||
"movaps xmm6, xmmword ptr [mb8+edi]\n"
|
"movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
|
||||||
"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
|
"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
|
||||||
"movaps xmm7, xmm6\n"
|
"movaps xmm7, xmm6\n"
|
||||||
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
||||||
|
@ -375,10 +375,10 @@ ihatemsvc:
|
||||||
"punpckhwd xmm4, xmm5\n"
|
"punpckhwd xmm4, xmm5\n"
|
||||||
|
|
||||||
// at last
|
// at last
|
||||||
"movaps xmmword ptr [rgb32+edi*4+0], xmm0\n"
|
"movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
|
||||||
"movaps xmmword ptr [rgb32+edi*4+16], xmm1\n"
|
"movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
|
||||||
"movaps xmmword ptr [rgb32+edi*4+32], xmm3\n"
|
"movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
|
||||||
"movaps xmmword ptr [rgb32+edi*4+48], xmm4\n"
|
"movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
|
||||||
|
|
||||||
"add edi, 16\n"
|
"add edi, 16\n"
|
||||||
|
|
||||||
|
@ -393,15 +393,11 @@ ihatemsvc:
|
||||||
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
|
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
|
||||||
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
|
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
|
||||||
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
|
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
|
||||||
[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset)
|
[yuv2rgb_temp]"r"(yuv2rgb_temp), [sse2_tables]"r"(sse2_tableoffset),
|
||||||
: "eax", "ebx", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
[mb8]"r"(mb8), [rgb32]"r"(rgb32)
|
||||||
|
: "eax", "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
||||||
);
|
);
|
||||||
#else
|
#else
|
||||||
# error Unsupported compiler
|
# error Unsupported compiler
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void yuv2rgb_init(void)
|
|
||||||
{
|
|
||||||
/* For later reimplementation of C version */
|
|
||||||
}
|
|
||||||
|
|
|
@ -17,6 +17,5 @@
|
||||||
|
|
||||||
#define yuv2rgb yuv2rgb_sse2
|
#define yuv2rgb yuv2rgb_sse2
|
||||||
|
|
||||||
extern void yuv2rgb_reference(void);
|
extern void yuv2rgb_reference();
|
||||||
extern void yuv2rgb_sse2(void);
|
extern void yuv2rgb_sse2();
|
||||||
extern void yuv2rgb_init(void);
|
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
// the lower 16 bit value. IF the change is breaking of all compatibility with old
|
// the lower 16 bit value. IF the change is breaking of all compatibility with old
|
||||||
// states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
|
// states, increment the upper 16 bit value, and clear the lower 16 bits to 0.
|
||||||
|
|
||||||
static const u32 g_SaveVersion = 0x8b470000;
|
static const u32 g_SaveVersion = 0x8b480000;
|
||||||
|
|
||||||
// this function is meant to be used in the place of GSfreeze, and provides a safe layer
|
// this function is meant to be used in the place of GSfreeze, and provides a safe layer
|
||||||
// between the GS saving function and the MTGS's needs. :)
|
// between the GS saving function and the MTGS's needs. :)
|
||||||
|
|
Loading…
Reference in New Issue