IPU optimizations -- use SSE for FIFO reads/writes, and streamline the IPU0dma / IPU1dma feeds a bit.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3771 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2010-09-15 17:11:24 +00:00
parent 01afac40f6
commit 1c75440a6c
7 changed files with 174 additions and 189 deletions

View File

@ -924,11 +924,11 @@ __fi u8 __fastcall getBits16(u8 *address, u32 advance)
mask = mask | (mask << 8);
*(u16*)address = ((~mask & *(u16*)(readpos + 1)) >> (8 - shift)) | (((mask) & *(u16*)readpos) << shift);
}
}
else
{
*(u16*)address = *(u16*)readpos;
}
}
if (advance) g_BP.BP += 16;
@ -947,14 +947,14 @@ u8 __fastcall getBits8(u8 *address, u32 advance)
readpos = readbits + (int)g_BP.BP / 8;
if (uint shift = (g_BP.BP & 7))
{
{
mask = (0xff >> shift);
*(u8*)address = (((~mask) & readpos[1]) >> (8 - shift)) | (((mask) & *readpos) << shift);
}
}
else
{
*(u8*)address = *(u8*)readpos;
}
}
if (advance) g_BP.BP += 8;

View File

@ -75,10 +75,7 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
while (transsize-- > 0)
{
for (int i = 0; i <= 3; i++)
{
data[writepos + i] = pMem[i];
}
CopyQWC(&data[writepos], pMem);
writepos = (writepos + 4) & 31;
pMem += 4;
}
@ -86,118 +83,97 @@ int IPU_Fifo_Input::write(u32* pMem, int size)
return firsttrans;
}
// Queues qwords from `value` into the IPU output FIFO.
// At most (8 - OFC) qwords are accepted; the return value is the number
// actually queued. IPU0dma() is invoked beforehand when the counter already
// reads full, and again once the new data has been counted in.
int IPU_Fifo_Output::write(const u32 *value, int size)
{
    if ((int)ipuRegs.ctrl.OFC >= 8) IPU0dma();

    // Clamp the request to the room the OFC counter reports.
    const int accepted = min(size, 8 - (int)ipuRegs.ctrl.OFC);

    for (int left = accepted; left > 0; --left)
    {
        // One qword is four entries starting at the write cursor.
        for (int word = 0; word < 4; ++word)
            data[writepos + word] = value[word];

        // Step the cursor one qword forward, wrapping at 32 entries.
        writepos = (writepos + 4) & 31;
        value += 4;
    }

    ipuRegs.ctrl.OFC += accepted;
    IPU0dma();
    return accepted;
}
// Pops one qword from the IPU input FIFO into `value`.
// Returns 1 when a qword was copied, or 0 when the FIFO is still empty after
// IPU1dma() has been called.
// NOTE(review): this listing interleaves lines from two revisions of the
// function (both the `IFC < 4` and `IFC < 1` guards, and both the per-word copy
// loop and the CopyQWC call) -- confirm which lines belong to the live version.
int IPU_Fifo_Input::read(void *value)
{
// wait until enough data to ensure proper streaming.
if (g_BP.IFC < 4)
if (g_BP.IFC < 1)
{
// IPU FIFO is empty and DMA is waiting so let's tell the DMA we are ready to put data in the FIFO
// NOTE(review): 0x9999 is presumably a sentinel cycle value marking the DMA as waiting -- confirm.
if(cpuRegs.eCycle[4] == 0x9999)
{
CPU_INT( DMAC_TO_IPU, 4 );
}
IPU1dma();
// Still nothing in the FIFO after the DMA call: report that no qword was read.
if (g_BP.IFC == 0) return 0;
pxAssert(g_BP.IFC > 0);
}
// transfer 1 qword, split into two transfers
for (int i = 0; i <= 3; i++)
{
((u32*)value)[i] = data[readpos + i];
// The consumed slot is cleared after it has been copied out.
data[readpos + i] = 0;
}
CopyQWC(value, &data[readpos]);
// Advance the read cursor by one qword (4 entries), wrapping at 32 entries.
readpos = (readpos + 4) & 31;
g_BP.IFC--;
return 1;
}
void IPU_Fifo_Output::_readsingle(void *value)
// Pushes up to `size` qwords from `value` into the IPU output FIFO and returns
// how many were accepted (origsize - size).
// Each pass of the do/while calls IPU0dma() first, then copies as many qwords
// as fit in the room the OFC counter reports (8 - OFC). The loop ends when
// nothing more can be transferred: all input consumed, or no room left.
// NOTE(review): this listing appears to interleave leftover lines of a removed
// helper (a per-word copy loop that touches `readpos`) with the body of
// write() -- confirm which lines belong to the live version.
int IPU_Fifo_Output::write(const u32 *value, uint size)
{
// transfer 1 qword, split into two transfers
for (int i = 0; i <= 3; i++)
pxAssumeMsg(size>0, "Invalid size==0 when calling IPU_Fifo_Output::write");
uint origsize = size;
do {
IPU0dma();
uint transsize = min(size, 8 - ipuRegs.ctrl.OFC);
if(!transsize) break;
// NOTE(review): this assigns OFC rather than adding to it, whereas the disabled
// (#if 0) path below uses `+=`. If IPU0dma() can leave OFC non-zero, the
// existing count would be discarded here -- confirm this is intended.
ipuRegs.ctrl.OFC = transsize;
size -= transsize;
while (transsize > 0)
{
CopyQWC(&data[writepos], value);
// Advance the write cursor by one qword (4 entries), wrapping at 32 entries.
writepos = (writepos + 4) & 31;
value += 4;
--transsize;
}
} while(true);
return origsize - size;
// Disabled earlier implementation: a single pass that only calls IPU0dma()
// up front when the FIFO counter already reads full.
#if 0
if (ipuRegs.ctrl.OFC >= 8) IPU0dma();
uint transsize = min(size, 8 - ipuRegs.ctrl.OFC);
uint firsttrans = transsize;
while (transsize > 0)
{
((u32*)value)[i] = data[readpos + i];
data[readpos + i] = 0;
CopyQWC(&data[writepos], value);
writepos = (writepos + 4) & 31;
value += 4;
--transsize;
}
readpos = (readpos + 4) & 31;
ipuRegs.ctrl.OFC += firsttrans;
IPU0dma();
return firsttrans;
#endif
}
void IPU_Fifo_Output::read(void *value, int size)
// Drains `size` qwords from the IPU output FIFO into `value`.
// pxAssume states the precondition that OFC >= size; OFC is reduced by `size`
// up front, before any data is copied.
// NOTE(review): the loop body below interleaves two revisions (the
// `_readsingle` call with a u32-based pointer step, and the CopyQWC call with a
// u128-based pointer step) -- confirm which lines belong to the live version.
void IPU_Fifo_Output::read(void *value, uint size)
{
pxAssume(ipuRegs.ctrl.OFC >= size);
ipuRegs.ctrl.OFC -= size;
// Zeroing the read data is not needed, since the ringbuffer design will never read back
// the zero'd data anyway. --air
//__m128 zeroreg = _mm_setzero_ps();
while (size > 0)
{
_readsingle(value);
value = (u32*)value + 4;
size--;
CopyQWC(value, &data[readpos]);
//_mm_store_ps((float*)&data[readpos], zeroreg);
// Advance the read cursor by one qword (4 entries), wrapping at 32 entries.
readpos = (readpos + 4) & 31;
value = (u128*)value + 1;
--size;
}
}
// Reads a single qword from the output FIFO into `value`, but only when the
// OFC counter is non-zero; otherwise the call does nothing.
void IPU_Fifo_Output::readsingle(void *value)
{
    // Nothing queued: return without touching the counter or the buffer.
    if (!(ipuRegs.ctrl.OFC > 0)) return;

    ipuRegs.ctrl.OFC -= 1;
    _readsingle(value);
}
// Copies the next qword of decoder output into `out`.
// Returns true after a successful copy (ipu0_data is decremented and ipu0_idx
// incremented), or false -- leaving `out` untouched -- when ipu0_data is zero.
__fi bool decoder_t::ReadIpuData(u128* out)
{
    if (ipu0_data != 0)
    {
        CopyQWC(out, GetIpuDataPtr());
        ipu0_data--;
        ipu0_idx++;

        IPU_LOG( "ReadFIFO/IPUout -> %ls", out->ToString().c_str() );
        return true;
    }

    IPU_LOG( "ReadFIFO/IPUout -> (fifo empty/no data available)" );
    return false;
}
// Handler for a 128-bit read from the IPUout FIFO: fills `out` with one qword.
// NOTE(review): this listing interleaves two revisions (the pxAssertDev guard
// followed by ipu_fifo.out.read, and the decoder.ReadIpuData path that steps
// readpos by hand) -- confirm which lines belong to the live version.
void __fastcall ReadFIFO_IPUout(mem128_t* out)
{
// FIXME! When ReadIpuData() doesn't succeed (returns false), the EE should probably stall
// until a value becomes available. This isn't exactly easy to do since the virtualized EE
// in PCSX2 *has* to be running in order for the IPU DMA to upload new input data to allow
// IPUout's FIFO to fill. Thus if we implement an EE stall, PCSX2 deadlocks. Grr. --air
// Return without writing to `out` when the assertion reports an empty FIFO.
if (!pxAssertDev( ipuRegs.ctrl.OFC > 0, "Attempted read from IPUout's FIFO, but the FIFO is empty!" )) return;
ipu_fifo.out.read(out, 1);
if (decoder.ReadIpuData(out))
{
// Advance the output FIFO read cursor by one qword (4 entries), wrapping at 32 entries.
ipu_fifo.out.readpos = (ipu_fifo.out.readpos + 4) & 31;
}
// Games should always check the fifo before reading from it -- so if the FIFO has no data
// it's either some glitchy game or a bug in pcsx2.
}
void __fastcall WriteFIFO_IPUin(const mem128_t* value)

View File

@ -37,13 +37,10 @@ struct IPU_Fifo_Output
int readpos, writepos;
// returns number of qw read
int write(const u32 * value, int size);
void read(void *value,int size);
void readsingle(void *value);
int write(const u32 * value, uint size);
void read(void *value, uint size);
void clear();
wxString desc() const;
void _readsingle(void *value);
};
struct IPU_Fifo

View File

@ -312,8 +312,9 @@ int IPU1dma()
int IPU0dma()
{
if(!ipuRegs.ctrl.OFC) return 0;
int readsize;
static int totalsize = 0;
tDMA_TAG* pMem;
if ((!(ipu0dma.chcr.STR) || (cpuRegs.interrupt & (1 << DMAC_FROM_IPU))) || (ipu0dma.qwc == 0))
@ -329,7 +330,6 @@ int IPU0dma()
pMem = dmaGetAddr(ipu0dma.madr, true);
readsize = min(ipu0dma.qwc, (u16)ipuRegs.ctrl.OFC);
totalsize+=readsize;
ipu_fifo.out.read(pMem, readsize);
ipu0dma.madr += readsize << 4;
@ -363,7 +363,6 @@ int IPU0dma()
//This broke vids in Digital Devil Saga
//Note that interrupting based on totalsize is just guessing..
IPU_INT_FROM( readsize * BIAS );
totalsize = 0;
}
return readsize;

View File

@ -338,9 +338,6 @@ do { \
static __fi bool get_intra_block()
{
int i;
int j;
int val;
const u8 * scan = decoder.scantype ? mpeg2_scan.alt : mpeg2_scan.norm;
const u8 (&quant_matrix)[64] = decoder.iq;
int quantizer_scale = decoder.quantizer_scale;
@ -348,7 +345,7 @@ static __fi bool get_intra_block()
u16 code;
/* decode AC coefficients */
for (i=1 + ipu_cmd.pos[4]; ; i++)
for (int i=1 + ipu_cmd.pos[4]; ; i++)
{
switch (ipu_cmd.pos[5])
{
@ -427,60 +424,64 @@ static __fi bool get_intra_block()
return true;
}
i+= tab->run == 65 ? GETBITS(6) : tab->run;
i += (tab->run == 65) ? GETBITS(6) : tab->run;
if (i >= 64)
{
ipu_cmd.pos[4] = 0;
return true;
}
case 1:
if (!GETWORD())
{
ipu_cmd.pos[4] = i - 1;
ipu_cmd.pos[5] = 1;
return false;
{
if (!GETWORD())
{
ipu_cmd.pos[4] = i - 1;
ipu_cmd.pos[5] = 1;
return false;
}
uint j = scan[i];
int val;
if (tab->run==65) /* escape */
{
if(!decoder.mpeg1)
{
val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
DUMPBITS(12);
}
else
{
val = SBITS(8);
DUMPBITS(8);
if (!(val & 0x7f))
{
val = GETBITS(8) + 2 * val;
}
val = (val * quantizer_scale * quant_matrix[i]) >> 4;
val = (val + ~ (((s32)val) >> 31)) | 1;
}
}
else
{
val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
if(decoder.mpeg1)
{
/* oddification */
val = (val - 1) | 1;
}
/* if (bitstream_get (1)) val = -val; */
val = (val ^ SBITS(1)) - SBITS(1);
DUMPBITS(1);
}
SATURATE(val);
dest[j] = val;
ipu_cmd.pos[5] = 0;
}
j = scan[i];
if (tab->run==65) /* escape */
{
if(!decoder.mpeg1)
{
val = (SBITS(12) * quantizer_scale * quant_matrix[i]) >> 4;
DUMPBITS(12);
}
else
{
val = SBITS(8);
DUMPBITS(8);
if (!(val & 0x7f))
{
val = GETBITS(8) + 2 * val;
}
val = (val * quantizer_scale * quant_matrix[i]) >> 4;
val = (val + ~ (((s32)val) >> 31)) | 1;
}
}
else
{
val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
if(decoder.mpeg1)
{
/* oddification */
val = (val - 1) | 1;
}
/* if (bitstream_get (1)) val = -val; */
val = (val ^ SBITS(1)) - SBITS(1);
DUMPBITS(1);
}
SATURATE(val);
dest[j] = val;
ipu_cmd.pos[5] = 0;
}
}
@ -798,6 +799,9 @@ bool mpeg2sliceIDEC()
ipu_cmd.pos[2] = 6;
return false;
}
break;
jNO_DEFAULT;
}
// Send The MacroBlock via DmaIpuFrom
@ -812,23 +816,23 @@ bool mpeg2sliceIDEC()
}
case 2:
while (decoder.ipu0_data > 0)
{
uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
{
pxAssume(decoder.ipu0_data > 0);
if (read == 0)
{
ipu_cmd.pos[1] = 2;
return false;
}
else
{
decoder.AdvanceIpuDataBy(read);
}
uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
decoder.AdvanceIpuDataBy(read);
if (decoder.ipu0_data != 0)
{
// IPU FIFO filled up -- Will have to finish transferring later.
ipu_cmd.pos[1] = 2;
return false;
}
decoder.mbc++;
mbaCount = 0;
}
case 3:
while (1)
{
@ -886,6 +890,8 @@ bool mpeg2sliceIDEC()
}
break;
jNO_DEFAULT;
}
ipu_cmd.pos[1] = 0;
@ -919,6 +925,8 @@ finish_idec:
BigEndian(ipuRegs.top, ipuRegs.top);
break;
jNO_DEFAULT;
}
return true;
@ -1010,6 +1018,8 @@ bool mpeg2_slice()
return false;
}
break;
jNO_DEFAULT;
}
ipu_copy(mb8, mb16);
@ -1077,6 +1087,8 @@ bool mpeg2_slice()
}
}
break;
jNO_DEFAULT;
}
}
}
@ -1101,21 +1113,23 @@ bool mpeg2_slice()
decoder.SetOutputTo(mb16);
case 3:
while (decoder.ipu0_data > 0)
{
uint size = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
{
pxAssume(decoder.ipu0_data > 0);
if (size == 0)
{
ipu_cmd.pos[0] = 3;
return false;
}
else
{
decoder.AdvanceIpuDataBy(size);
}
uint read = ipu_fifo.out.write((u32*)decoder.GetIpuDataPtr(), decoder.ipu0_data);
decoder.AdvanceIpuDataBy(read);
if (decoder.ipu0_data != 0)
{
// IPU FIFO filled up -- Will have to finish transferring later.
ipu_cmd.pos[0] = 3;
return false;
}
decoder.mbc++;
mbaCount = 0;
}
case 4:
bit8 = 1;
if (!getBits8((u8*)&bit8, 0))

View File

@ -148,7 +148,7 @@ struct decoder_t {
macroblock_rgb32 rgb32;
macroblock_rgb16 rgb16;
uint ipu0_data;
uint ipu0_data; // amount of data in the output macroblock (in QWC)
uint ipu0_idx;
/* bit parsing stuff */
@ -230,7 +230,7 @@ struct decoder_t {
ipu0_data -= amt;
}
bool ReadIpuData(u128* out);
__fi bool ReadIpuData(u128* out);
};
struct mpeg2_scan_pack

View File

@ -36,22 +36,21 @@
// Tops up decoder.bitstream_buf with 16 more bits fetched through getBits16().
// Returns 0 when getBits16() fails, and 1 otherwise -- including the case where
// decoder.bitstream_bits is not positive and no fetch is attempted.
// NOTE(review): this listing interleaves two revisions of the function (the
// nested `if (decoder.bitstream_bits > 0)` form and the early-return form), so
// several statements appear twice -- confirm which lines belong to the live version.
static __fi int GETWORD()
{
static u8 data[2];
if (decoder.bitstream_bits <= 0) return 1;
if (decoder.bitstream_bits > 0)
static u8 data[2];
// getBits16 is called with advance=1.
if(!getBits16(data,1))
{
if(!getBits16(data,1))
{
return 0;
}
/*u32 data;
BigEndian(data, *(u32*)word);
decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits;
decoder.bitstream_bits -= 32;*/
// Combine the two fetched bytes (data[0] as the high byte) and OR them into the
// buffer, shifted left by bitstream_bits; bitstream_bits then drops by 16.
decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
decoder.bitstream_bits -= 16;
return 0;
}
/*u32 data;
BigEndian(data, *(u32*)word);
decoder.bitstream_buf |= (u64)data << decoder.bitstream_bits;
decoder.bitstream_bits -= 32;*/
decoder.bitstream_buf |= (((u32)data[0] << 8) | data[1]) << decoder.bitstream_bits;
decoder.bitstream_bits -= 16;
return 1;
}