IPU optimizations -- use SSE for FIFO reads/writes, and streamlined IPU0dma/IPU1dma feeds a bit.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3771 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2010-09-15 17:11:24 +00:00
parent 01afac40f6
commit 1c75440a6c
7 changed files with 174 additions and 189 deletions

View File

@ -924,11 +924,11 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance)
mask = mask | (mask << 8); mask = mask | (mask << 8);
*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift); *(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
} }
else else
{ {
*(u16*)address = *(u16*)readpos; *(u16*)address = *(u16*)readpos;
} }
if (advance) g_BP.BP += 16; if (advance) g_BP.BP += 16;
@ -947,14 +947,14 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
readpos = readbits + (int)g_BP.BP / 8; readpos = readbits + (int)g_BP.BP / 8;
if (uint shift = (g_BP.BP & 7)) if (uint shift = (g_BP.BP & 7))
{ {
mask = (0xff >> shift); mask = (0xff >> shift);
*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift); *(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
} }
else else
{ {
*(u8*)address = *(u8*)readpos; *(u8*)address = *(u8*)readpos;
} }
if (advance) g_BP.BP += 8; if (advance) g_BP.BP += 8;

View File

@ -75,10 +75,7 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
while (transsize-- > 0) while (transsize-- > 0)
{ {
for (int i = 0; i <= 3; i++) CopyQWC(&data[writepos], pMem);
{
data[writepos + i] = pMem[i];
}
writepos = (writepos + 4) & 31; writepos = (writepos + 4) & 31;
pMem += 4; pMem += 4;
} }
@ -86,118 +83,97 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
return firsttrans; return firsttrans;
} }
int IPU_Fifo_Output::write(const u32 *value, int size)
{
int transsize, firsttrans;
if ((int)ipuRegs.ctrl.OFC >= 8) IPU0dma();
transsize = min(size, 8 - (int)ipuRegs.ctrl.OFC);
firsttrans = transsize;
while (transsize-- > 0)
{
for (int i = 0; i <= 3; i++)
{
data[writepos + i] = ((u32*)value)[i];
}
writepos = (writepos + 4) & 31;
value += 4;
}
ipuRegs.ctrl.OFC += firsttrans;
IPU0dma();
return firsttrans;
}
int IPU_Fifo_Input::read(void *value) int IPU_Fifo_Input::read(void *value)
{ {
// wait until enough data to ensure proper streaming. // wait until enough data to ensure proper streaming.
if (g_BP.IFC < 4) if (g_BP.IFC < 1)
{ {
// IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO // IPU FIFO is empty and DMA is waiting so lets tell the DMA we are ready to put data in the FIFO
if(cpuRegs.eCycle[4] == 0x9999) if(cpuRegs.eCycle[4] == 0x9999)
{ IPU1dma();
CPU_INT( DMAC_TO_IPU, 4 );
}
if (g_BP.IFC == 0) return 0; if (g_BP.IFC == 0) return 0;
pxAssert(g_BP.IFC > 0);
} }
// transfer 1 qword, split into two transfers CopyQWC(value, &data[readpos]);
for (int i = 0; i <= 3; i++)
{
((u32*)value)[i] = data[readpos + i];
data[readpos + i] = 0;
}
readpos = (readpos + 4) & 31; readpos = (readpos + 4) & 31;
g_BP.IFC--; g_BP.IFC--;
return 1; return 1;
} }
void IPU_Fifo_Output::_readsingle(void *value) int IPU_Fifo_Output::write(const u32 *value, uint size)
{ {
// transfer 1 qword, split into two transfers pxAssumeMsg(size>0, "Invalid size==0 when calling IPU_Fifo_Output::write");
for (int i = 0; i <= 3; i++)
uint origsize = size;
do {
IPU0dma();
uint transsize = min(size, 8 - ipuRegs.ctrl.OFC);
if(!transsize) break;
ipuRegs.ctrl.OFC = transsize;
size -= transsize;
while (transsize > 0)
{
CopyQWC(&data[writepos], value);
writepos = (writepos + 4) & 31;
value += 4;
--transsize;
}
} while(true);
return origsize - size;
#if 0
if (ipuRegs.ctrl.OFC >= 8) IPU0dma();
uint transsize = min(size, 8 - ipuRegs.ctrl.OFC);
uint firsttrans = transsize;
while (transsize > 0)
{ {
((u32*)value)[i] = data[readpos + i]; CopyQWC(&data[writepos], value);
data[readpos + i] = 0; writepos = (writepos + 4) & 31;
value += 4;
--transsize;
} }
readpos = (readpos + 4) & 31;
ipuRegs.ctrl.OFC += firsttrans;
IPU0dma();
return firsttrans;
#endif
} }
void IPU_Fifo_Output::read(void *value, int size) void IPU_Fifo_Output::read(void *value, uint size)
{ {
pxAssume(ipuRegs.ctrl.OFC >= size);
ipuRegs.ctrl.OFC -= size; ipuRegs.ctrl.OFC -= size;
// Zeroing the read data is not needed, since the ringbuffer design will never read back
// the zero'd data anyway. --air
//__m128 zeroreg = _mm_setzero_ps();
while (size > 0) while (size > 0)
{ {
_readsingle(value); CopyQWC(value, &data[readpos]);
value = (u32*)value + 4; //_mm_store_ps((float*)&data[readpos], zeroreg);
size--;
readpos = (readpos + 4) & 31;
value = (u128*)value + 1;
--size;
} }
} }
void IPU_Fifo_Output::readsingle(void *value)
{
if (ipuRegs.ctrl.OFC > 0)
{
ipuRegs.ctrl.OFC--;
_readsingle(value);
}
}
__fi bool decoder_t::ReadIpuData(u128* out)
{
if(ipu0_data == 0)
{
IPU_LOG( "ReadFIFO/IPUout -> (fifo empty/no data available)" );
return false;
}
CopyQWC(out, GetIpuDataPtr());
--ipu0_data;
++ipu0_idx;
IPU_LOG( "ReadFIFO/IPUout -> %ls", out->ToString().c_str() );
return true;
}
void __fastcall ReadFIFO_IPUout(mem128_t* out) void __fastcall ReadFIFO_IPUout(mem128_t* out)
{ {
// FIXME! When ReadIpuData() doesn't succeed (returns false), the EE should probably stall if (!pxAssertDev( ipuRegs.ctrl.OFC > 0, "Attempted read from IPUout's FIFO, but the FIFO is empty!" )) return;
// until a value becomes available. This isn't exactly easy to do since the virtualized EE ipu_fifo.out.read(out, 1);
// in PCSX2 *has* to be running in order for the IPU DMA to upload new input data to allow
// IPUout's FIFO to fill. Thus if we implement an EE stall, PCSX2 deadlocks. Grr. --air
if (decoder.ReadIpuData(out)) // Games should always check the fifo before reading from it -- so if the FIFO has no data
{ // its either some glitchy game or a bug in pcsx2.
ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
}
} }
void __fastcall WriteFIFO_IPUin(const mem128_t* value) void __fastcall WriteFIFO_IPUin(const mem128_t* value)

View File

@ -37,13 +37,10 @@ struct IPU_Fifo_Output
int readpos, writepos; int readpos, writepos;
// returns number of qw read // returns number of qw read
int write(const u32 * value, int size); int write(const u32 * value, uint size);
void read(void *value,int size); void read(void *value, uint size);
void readsingle(void *value);
void clear(); void clear();
wxString desc() const; wxString desc() const;
void _readsingle(void *value);
}; };
struct IPU_Fifo struct IPU_Fifo

View File

@ -312,8 +312,9 @@ int IPU1dma()
int IPU0dma() int IPU0dma()
{ {
if(!ipuRegs.ctrl.OFC) return 0;
int readsize; int readsize;
static int totalsize = 0;
tDMA_TAG* pMem; tDMA_TAG* pMem;
if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0)) if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0))
@ -329,7 +330,6 @@ int IPU0dma()
pMem = dmaGetAddr(ipu0dma.madr, true); pMem = dmaGetAddr(ipu0dma.madr, true);
readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC); readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC);
totalsize+=readsize;
ipu_fifo.out.read(pMem, readsize); ipu_fifo.out.read(pMem, readsize);
ipu0dma.madr += readsize << 4; ipu0dma.madr += readsize << 4;
@ -363,7 +363,6 @@ int IPU0dma()
//This broke vids in Digital Devil Saga //This broke vids in Digital Devil Saga
//Note that interrupting based on totalsize is just guessing.. //Note that interrupting based on totalsize is just guessing..
IPU_INT_FROM( readsize * BIAS ); IPU_INT_FROM( readsize * BIAS );
totalsize = 0;
} }
return readsize; return readsize;

View File

@ -338,9 +338,6 @@ do { \
static __fi bool get_intra_block() static __fi bool get_intra_block()
{ {
int i;
int j;
int val;
const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm; const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
const u8 (&quant_matrix)[64] = decoder.iq; const u8 (&quant_matrix)[64] = decoder.iq;
int quantizer_scale = decoder.quantizer_scale; int quantizer_scale = decoder.quantizer_scale;
@ -348,7 +345,7 @@ static __fi bool get_intra_block()
u16 code; u16 code;
/* decode AC coefficients */ /* decode AC coefficients */
for (i=1 + ipu_cmd.pos[4]; ; i++) for (int i=1 + ipu_cmd.pos[4]; ; i++)
{ {
switch (ipu_cmd.pos[5]) switch (ipu_cmd.pos[5])
{ {
@ -427,60 +424,64 @@ static __fi bool get_intra_block()
return true; return true;
} }
i+= tab->run == 65 ? GETBITS(6) : tab->run; i += (tab->run == 65) ? GETBITS(6) : tab->run;
if (i >= 64) if (i >= 64)
{ {
ipu_cmd.pos[4] = 0; ipu_cmd.pos[4] = 0;
return true; return true;
} }
case 1: case 1:
if (!GETWORD()) {
{ if (!GETWORD())
ipu_cmd.pos[4] = i - 1; {
ipu_cmd.pos[5] = 1; ipu_cmd.pos[4] = i - 1;
return false; ipu_cmd.pos[5] = 1;
return false;
}
uint j = scan[i];
int val;
if (tab->run==65) /* escape */
{
if(!decoder.mpeg1)
{
val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
DUMPBITS(12);
}
else
{
val = SBITS(8);
DUMPBITS(8);
if (!(val & 0x7f))
{
val = GETBITS(8) + 2 * val;
}
val = (val * quantizer_scale * quant_matrix[i]) >> 4;
val = (val + ~ (((s32)val) >> 31)) | 1;
}
}
else
{
val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
if(decoder.mpeg1)
{
/* oddification */
val = (val - 1) | 1;
}
/* if (bitstream_get (1)) val = -val; */
val = (val ^ SBITS(1)) - SBITS(1);
DUMPBITS(1);
}
SATURATE(val);
dest[j] = val;
ipu_cmd.pos[5] = 0;
} }
j = scan[i];
if (tab->run==65) /* escape */
{
if(!decoder.mpeg1)
{
val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
DUMPBITS(12);
}
else
{
val = SBITS(8);
DUMPBITS(8);
if (!(val & 0x7f))
{
val = GETBITS(8) + 2 * val;
}
val = (val * quantizer_scale * quant_matrix[i]) >> 4;
val = (val + ~ (((s32)val) >> 31)) | 1;
}
}
else
{
val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
if(decoder.mpeg1)
{
/* oddification */
val = (val - 1) | 1;
}
/* if (bitstream_get (1)) val = -val; */
val = (val ^ SBITS(1)) - SBITS(1);
DUMPBITS(1);
}
SATURATE(val);
dest[j] = val;
ipu_cmd.pos[5] = 0;
} }
} }
@ -798,6 +799,9 @@ bool mpeg2sliceIDEC()
ipu_cmd.pos[2] = 6; ipu_cmd.pos[2] = 6;
return false; return false;
} }
break;
jNO_DEFAULT;
} }
// Send The MacroBlock via DmaIpuFrom // Send The MacroBlock via DmaIpuFrom
@ -812,23 +816,23 @@ bool mpeg2sliceIDEC()
} }
case 2: case 2:
while (decoder.ipu0_data > 0) {
{ pxAssume(decoder.ipu0_data > 0);
uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
if (read == 0) uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
{ decoder.AdvanceIpuDataBy(read);
ipu_cmd.pos[1] = 2;
return false; if (decoder.ipu0_data != 0)
} {
else // IPU FIFO filled up -- Will have to finish transferring later.
{ ipu_cmd.pos[1] = 2;
decoder.AdvanceIpuDataBy(read); return false;
}
} }
decoder.mbc++; decoder.mbc++;
mbaCount = 0; mbaCount = 0;
}
case 3: case 3:
while (1) while (1)
{ {
@ -886,6 +890,8 @@ bool mpeg2sliceIDEC()
} }
break; break;
jNO_DEFAULT;
} }
ipu_cmd.pos[1] = 0; ipu_cmd.pos[1] = 0;
@ -919,6 +925,8 @@ finish_idec:
BigEndian(ipuRegs.top, ipuRegs.top); BigEndian(ipuRegs.top, ipuRegs.top);
break; break;
jNO_DEFAULT;
} }
return true; return true;
@ -1010,6 +1018,8 @@ bool mpeg2_slice()
return false; return false;
} }
break; break;
jNO_DEFAULT;
} }
ipu_copy(mb8, mb16); ipu_copy(mb8, mb16);
@ -1077,6 +1087,8 @@ bool mpeg2_slice()
} }
} }
break; break;
jNO_DEFAULT;
} }
} }
} }
@ -1101,21 +1113,23 @@ bool mpeg2_slice()
decoder.SetOutputTo(mb16); decoder.SetOutputTo(mb16);
case 3: case 3:
while (decoder.ipu0_data > 0) {
{ pxAssume(decoder.ipu0_data > 0);
uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
if (size == 0) uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
{ decoder.AdvanceIpuDataBy(read);
ipu_cmd.pos[0] = 3;
return false; if (decoder.ipu0_data != 0)
} {
else // IPU FIFO filled up -- Will have to finish transferring later.
{ ipu_cmd.pos[0] = 3;
decoder.AdvanceIpuDataBy(size); return false;
}
} }
decoder.mbc++;
mbaCount = 0;
}
case 4: case 4:
bit8 = 1; bit8 = 1;
if (!getBits8((u8*)&bit8, 0)) if (!getBits8((u8*)&bit8, 0))

View File

@ -148,7 +148,7 @@ struct decoder_t {
macroblock_rgb32 rgb32; macroblock_rgb32 rgb32;
macroblock_rgb16 rgb16; macroblock_rgb16 rgb16;
uint ipu0_data; uint ipu0_data; // amount of data in the output macroblock (in QWC)
uint ipu0_idx; uint ipu0_idx;
/* bit parsing stuff */ /* bit parsing stuff */
@ -230,7 +230,7 @@ struct decoder_t {
ipu0_data -= amt; ipu0_data -= amt;
} }
bool ReadIpuData(u128* out); __fi bool ReadIpuData(u128* out);
}; };
struct mpeg2_scan_pack struct mpeg2_scan_pack

View File

@ -36,23 +36,22 @@
static __fi int GETWORD() static __fi int GETWORD()
{ {
if (decoder.bitstream_bits <= 0) return 1;
static u8 data[2]; static u8 data[2];
if (decoder.bitstream_bits > 0) if(!getBits16(data,1))
{ {
if(!getBits16(data,1)) return 0;
{
return 0;
}
/*u32 data;
BigEndian(data, *(u32*)word);
decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits;
decoder.bitstream_bits -= 32;*/
decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
decoder.bitstream_bits -= 16;
} }
/*u32 data;
BigEndian(data, *(u32*)word);
decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits;
decoder.bitstream_bits -= 32;*/
decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
decoder.bitstream_bits -= 16;
return 1; return 1;
} }