mirror of https://github.com/PCSX2/pcsx2.git
Merge pull request #215 from xsacha/memcpy
Remove some slow, redundant memcpy implementations: memcpy_const/memcpy_...
This commit is contained in:
commit
98d22f8b2e
|
@ -118,7 +118,7 @@ template< typename T >
|
|||
SafeArray<T>* SafeArray<T>::Clone() const
|
||||
{
|
||||
SafeArray<T>* retval = new SafeArray<T>( m_size );
|
||||
memcpy_fast( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
|
||||
memcpy( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
@ -160,7 +160,7 @@ template< typename T, uint Alignment >
|
|||
SafeAlignedArray<T,Alignment>* SafeAlignedArray<T,Alignment>::Clone() const
|
||||
{
|
||||
SafeAlignedArray<T,Alignment>* retval = new SafeAlignedArray<T,Alignment>( this->m_size );
|
||||
memcpy_fast( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
|
||||
memcpy( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
@ -272,14 +272,14 @@ void SafeList<T>::Remove( int index )
|
|||
|
||||
int copylen = m_length - index;
|
||||
if( copylen > 0 )
|
||||
memcpy_fast( &m_ptr[index], &m_ptr[index+1], copylen );
|
||||
memcpy( &m_ptr[index], &m_ptr[index+1], copylen );
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
SafeList<T>* SafeList<T>::Clone() const
|
||||
{
|
||||
SafeList<T>* retval = new SafeList<T>( m_length );
|
||||
memcpy_fast( retval->m_ptr, m_ptr, sizeof(T) * m_length );
|
||||
memcpy( retval->m_ptr, m_ptr, sizeof(T) * m_length );
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
|
|
@ -265,7 +265,7 @@ FastFormatUnicode& FastFormatUnicode::WriteV( const char* fmt, va_list argptr )
|
|||
const uint inspos = m_Length;
|
||||
const uint convLen = converted.Length();
|
||||
m_dest->MakeRoomFor((inspos + convLen + 64) * sizeof(wxChar));
|
||||
memcpy_fast( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
|
||||
memcpy( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
|
||||
m_Length += convLen;
|
||||
|
||||
return *this;
|
||||
|
|
|
@ -60,7 +60,7 @@ void xSmartJump::SetTarget()
|
|||
u8* destpos = xGetPtr();
|
||||
const int copylen = (sptr)target - (sptr)saveme;
|
||||
|
||||
memcpy_fast( destpos, saveme, copylen );
|
||||
memcpy( destpos, saveme, copylen );
|
||||
xSetPtr( target - spacer );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -725,7 +725,7 @@ int cdvdReadSector() {
|
|||
mdest[11] = 0;
|
||||
|
||||
// normal 2048 bytes of sector data
|
||||
memcpy_const(&mdest[12], cdr.Transfer, 2048);
|
||||
memcpy(&mdest[12], cdr.Transfer, 2048);
|
||||
|
||||
// 4 bytes of edc (not calculated at present)
|
||||
mdest[2060] = 0;
|
||||
|
@ -735,7 +735,7 @@ int cdvdReadSector() {
|
|||
}
|
||||
else
|
||||
{
|
||||
memcpy_fast( mdest, cdr.Transfer, cdvd.BlockSize);
|
||||
memcpy( mdest, cdr.Transfer, cdvd.BlockSize);
|
||||
}
|
||||
|
||||
// decrypt sector's bytes
|
||||
|
@ -1567,7 +1567,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
|
|||
cdvd.Param[cdvd.ParamP-5], cdvd.Param[cdvd.ParamP-3], cdvd.Param[cdvd.ParamP-2], cdvd.Param[cdvd.ParamP-1]);
|
||||
Console.WriteLn("RTC Write Sec %d Min %d Hr %d Day %d Month %d Year %d", cdvd.RTC.second, cdvd.RTC.minute,
|
||||
cdvd.RTC.hour, cdvd.RTC.day, cdvd.RTC.month, cdvd.RTC.year);*/
|
||||
//memcpy_fast((u8*)&cdvd.RTC, cdvd.Param, 7);
|
||||
//memcpy((u8*)&cdvd.RTC, cdvd.Param, 7);
|
||||
break;
|
||||
|
||||
case 0x0A: // sceCdReadNVM (2:3)
|
||||
|
@ -1907,7 +1907,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
|
|||
}
|
||||
else
|
||||
{
|
||||
memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
|
||||
memcpy(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
|
||||
cdvd.mg_size += cdvd.ParamC;
|
||||
cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
|
||||
}
|
||||
|
@ -1915,9 +1915,9 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
|
|||
|
||||
case 0x8E: // sceMgReadData
|
||||
SetResultSize( std::min(16, cdvd.mg_size) );
|
||||
memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
|
||||
memcpy(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
|
||||
cdvd.mg_size -= cdvd.ResultC;
|
||||
memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
|
||||
memcpy(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
|
||||
break;
|
||||
|
||||
case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
|
||||
|
@ -1984,7 +1984,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
|
|||
{
|
||||
SetResultSize(3);//in:0
|
||||
int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
|
||||
memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
|
||||
memcpy(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
|
||||
|
||||
cdvd.mg_maxsize = 0; // don't allow any write
|
||||
cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data
|
||||
|
|
|
@ -422,7 +422,7 @@ s32 CALLBACK ISOreadSector(u8* tempbuffer, u32 lsn, int mode)
|
|||
jNO_DEFAULT
|
||||
}
|
||||
|
||||
memcpy_fast(tempbuffer, pbuffer, psize);
|
||||
memcpy(tempbuffer, pbuffer, psize);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -917,7 +917,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
|
|||
}
|
||||
|
||||
cdsize = (bcr & 0xffff) * 4;
|
||||
memcpy_fast(iopPhysMem(madr), cdr.pTransfer, cdsize);
|
||||
memcpy(iopPhysMem(madr), cdr.pTransfer, cdsize);
|
||||
psxCpu->Clear(madr, cdsize/4);
|
||||
cdr.pTransfer+=cdsize;
|
||||
|
||||
|
@ -947,7 +947,7 @@ s32 CALLBACK cdvdDmaRead(s32 channel, u32* data, u32 bytesLeft, u32* bytesProces
|
|||
return 10000;
|
||||
}
|
||||
|
||||
memcpy_fast(data, cdr.pTransfer, wordsLeft);
|
||||
memcpy(data, cdr.pTransfer, wordsLeft);
|
||||
//psxCpu->Clear(madr, cdsize/4);
|
||||
cdr.pTransfer+=wordsLeft;
|
||||
*wordsProcessed = wordsLeft;
|
||||
|
|
|
@ -145,7 +145,7 @@ int InputIsoFile::FinishRead3(u8* dst, uint mode)
|
|||
length = end - _offset;
|
||||
|
||||
uint read_offset = (m_current_lsn - m_read_lsn) * m_blocksize;
|
||||
memcpy_fast(dst + diff, m_readbuffer + ndiff + read_offset, length);
|
||||
memcpy(dst + diff, m_readbuffer + ndiff + read_offset, length);
|
||||
|
||||
if (m_type == ISOTYPE_CD && diff >= 12)
|
||||
{
|
||||
|
|
|
@ -161,7 +161,7 @@ int IsoFile::internalRead(void* dest, int off, int len)
|
|||
slen = (int) (maxOffset - currentOffset);
|
||||
}
|
||||
|
||||
memcpy_fast((u8*)dest + off, currentSector + sectorOffset, slen);
|
||||
memcpy((u8*)dest + off, currentSector + sectorOffset, slen);
|
||||
|
||||
sectorOffset += slen;
|
||||
currentOffset += slen;
|
||||
|
|
12
pcsx2/GS.h
12
pcsx2/GS.h
|
@ -442,27 +442,27 @@ extern __aligned(32) MTGS_BufferedData RingBuffer;
|
|||
inline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) {
|
||||
uint endpos = destStart + len;
|
||||
if ( endpos < destSize ) {
|
||||
memcpy_qwc(&destBase[destStart], src, len );
|
||||
memcpy(&destBase[destStart], src, len*16);
|
||||
destStart += len;
|
||||
}
|
||||
else {
|
||||
uint firstcopylen = destSize - destStart;
|
||||
memcpy_qwc(&destBase[destStart], src, firstcopylen );
|
||||
memcpy(&destBase[destStart], src, firstcopylen*16);
|
||||
destStart = endpos % destSize;
|
||||
memcpy_qwc(destBase, src+firstcopylen, destStart );
|
||||
memcpy(destBase, src+firstcopylen, destStart*16);
|
||||
}
|
||||
}
|
||||
|
||||
inline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) {
|
||||
uint endpos = srcStart + len;
|
||||
if ( endpos < srcSize ) {
|
||||
memcpy_qwc(dest, &srcBase[srcStart], len );
|
||||
memcpy(dest, &srcBase[srcStart], len*16);
|
||||
srcStart += len;
|
||||
}
|
||||
else {
|
||||
uint firstcopylen = srcSize - srcStart;
|
||||
memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
|
||||
memcpy(dest, &srcBase[srcStart], firstcopylen*16);
|
||||
srcStart = endpos % srcSize;
|
||||
memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
|
||||
memcpy(dest+firstcopylen, srcBase, srcStart*16);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -222,7 +222,7 @@ struct Gif_Path {
|
|||
}
|
||||
//DevCon.WriteLn("Realign Packet [%d]", curSize - offset);
|
||||
if (intersect) memmove(buffer, &buffer[offset], curSize - offset);
|
||||
else memcpy_fast(buffer, &buffer[offset], curSize - offset);
|
||||
else memcpy(buffer, &buffer[offset], curSize - offset);
|
||||
curSize -= offset;
|
||||
curOffset = gsPack.size;
|
||||
gsPack.offset = 0;
|
||||
|
@ -241,8 +241,7 @@ struct Gif_Path {
|
|||
mtgsReadWait(); // Let MTGS run to free up buffer space
|
||||
}
|
||||
pxAssertDev(curSize+size<=buffSize, "Gif Path Buffer Overflow!");
|
||||
if (aligned) memcpy_qwc (&buffer[curSize], pMem, size/16);
|
||||
else memcpy_fast(&buffer[curSize], pMem, size);
|
||||
memcpy (&buffer[curSize], pMem, size);
|
||||
curSize += size;
|
||||
}
|
||||
|
||||
|
|
|
@ -181,7 +181,7 @@ void SysMtgsThread::OpenPlugin()
|
|||
{
|
||||
if( m_PluginOpened ) return;
|
||||
|
||||
memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
|
||||
memcpy( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
|
||||
GSsetBaseMem( RingBuffer.Regs );
|
||||
GSirqCallback( dummyIrqCallback );
|
||||
|
||||
|
@ -626,7 +626,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
|
|||
if (syncRegs) {
|
||||
ScopedLock lock(m_mtx_WaitGS);
|
||||
// Completely synchronize GS and MTGS register states.
|
||||
memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
|
||||
memcpy(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -217,7 +217,7 @@ __fi u32 VU_Thread::Read()
|
|||
|
||||
__fi void VU_Thread::Read(void* dest, u32 size)
|
||||
{
|
||||
memcpy_fast(dest, &buffer[read_pos], size);
|
||||
memcpy(dest, &buffer[read_pos], size);
|
||||
incReadPos(size_u32(size));
|
||||
}
|
||||
|
||||
|
@ -240,7 +240,7 @@ __fi void VU_Thread::Write(u32 val)
|
|||
}
|
||||
__fi void VU_Thread::Write(void* src, u32 size)
|
||||
{
|
||||
memcpy_fast(GetWritePtr(), src, size);
|
||||
memcpy(GetWritePtr(), src, size);
|
||||
write_offset += size_u32(size);
|
||||
}
|
||||
|
||||
|
|
|
@ -220,7 +220,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
|
|||
// the transfer is most likely wrapped/partial. We need to queue it into a linear buffer
|
||||
// and then send it on its way on the next copy.
|
||||
|
||||
memcpy_qwc( path1queue, src128, data );
|
||||
memcpy( path1queue, src128, data*16);
|
||||
path1size = data;
|
||||
}
|
||||
else
|
||||
|
@ -235,7 +235,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
|
|||
if (src128 == RingBuffer.m_Ring)
|
||||
{
|
||||
pxAssert( (data+path1size) <= 0x400 );
|
||||
memcpy_qwc( &path1queue[path1size], src128, data );
|
||||
memcpy( &path1queue[path1size], src128, data*16);
|
||||
path1size += data;
|
||||
}
|
||||
GSgifTransfer1( (u32*)path1queue, 0 );
|
||||
|
@ -455,7 +455,7 @@ static s32 CALLBACK CDVD_getBuffer2(u8* buffer)
|
|||
u8* pb = CDVD->getBuffer();
|
||||
if(pb == NULL) return -2;
|
||||
|
||||
memcpy_fast( buffer, pb, lastReadSize );
|
||||
memcpy( buffer, pb, lastReadSize );
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -216,7 +216,7 @@ static int __Deci2Call(int call, u32 *addr)
|
|||
pdeciaddr += (d2ptr[4]+0xc) % 16;
|
||||
|
||||
const int copylen = std::min<uint>(255, d2ptr[1]-0xc);
|
||||
memcpy_fast(deci2buffer, pdeciaddr, copylen );
|
||||
memcpy(deci2buffer, pdeciaddr, copylen );
|
||||
deci2buffer[copylen] = '\0';
|
||||
|
||||
eeConLog( ShiftJIS_ConvertString(deci2buffer) );
|
||||
|
|
|
@ -97,7 +97,7 @@ int _SPR0chain()
|
|||
//Taking an arbitrary small value for games which like to check the QWC/MADR instead of STR, so get most of
|
||||
//the cycle delay out of the way before the end.
|
||||
partialqwc = spr0ch.qwc;
|
||||
memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
|
||||
memcpy(pMem, &psSu128(spr0ch.sadr), partialqwc*16);
|
||||
|
||||
// clear VU mem also!
|
||||
TestClearVUs(spr0ch.madr, partialqwc, true);
|
||||
|
@ -151,7 +151,7 @@ void _SPR0interleave()
|
|||
case MFD_RESERVED:
|
||||
// clear VU mem also!
|
||||
TestClearVUs(spr0ch.madr, spr0ch.qwc, true);
|
||||
memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
|
||||
memcpy(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc*16);
|
||||
break;
|
||||
}
|
||||
spr0ch.sadr += spr0ch.qwc * 16;
|
||||
|
@ -322,7 +322,7 @@ __fi static void SPR1transfer(const void* data, int qwc)
|
|||
TestClearVUs(spr1ch.madr, spr1ch.qwc, false);
|
||||
}
|
||||
|
||||
memcpy_qwc(&psSu128(spr1ch.sadr), data, qwc);
|
||||
memcpy(&psSu128(spr1ch.sadr), data, qwc*16);
|
||||
spr1ch.sadr += qwc * 16;
|
||||
}
|
||||
|
||||
|
@ -381,7 +381,7 @@ void _SPR1interleave()
|
|||
spr1ch.qwc = std::min(tqwc, qwc);
|
||||
qwc -= spr1ch.qwc;
|
||||
pMem = SPRdmaGetAddr(spr1ch.madr, false);
|
||||
memcpy_qwc(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc);
|
||||
memcpy(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc*16);
|
||||
spr1ch.sadr += spr1ch.qwc * 16;
|
||||
spr1ch.madr += (sqwc + spr1ch.qwc) * 16;
|
||||
}
|
||||
|
|
|
@ -126,7 +126,7 @@ SaveStateBase& SaveStateBase::FreezeBios()
|
|||
pxToUTF8 utf8(BiosDescription);
|
||||
|
||||
memzero( biosdesc );
|
||||
memcpy_fast( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
|
||||
memcpy( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
|
||||
|
||||
Freeze( bioscheck );
|
||||
Freeze( biosdesc );
|
||||
|
@ -282,7 +282,7 @@ void memSavingState::FreezeMem( void* data, int size )
|
|||
if (!size) return;
|
||||
|
||||
m_memory->MakeRoomFor( m_idx + size );
|
||||
memcpy_fast( m_memory->GetPtr(m_idx), data, size );
|
||||
memcpy( m_memory->GetPtr(m_idx), data, size );
|
||||
m_idx += size;
|
||||
}
|
||||
|
||||
|
@ -322,7 +322,7 @@ void memLoadingState::FreezeMem( void* data, int size )
|
|||
{
|
||||
const u8* const src = m_memory->GetPtr(m_idx);
|
||||
m_idx += size;
|
||||
memcpy_fast( data, src, size );
|
||||
memcpy( data, src, size );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------
|
||||
|
|
|
@ -53,8 +53,8 @@ struct sifFifo
|
|||
const int wP0 = std::min((FIFO_SIF_W - writePos), words);
|
||||
const int wP1 = words - wP0;
|
||||
|
||||
memcpy_fast(&data[writePos], from, wP0 << 2);
|
||||
memcpy_fast(&data[0], &from[wP0], wP1 << 2);
|
||||
memcpy(&data[writePos], from, wP0 << 2);
|
||||
memcpy(&data[0], &from[wP0], wP1 << 2);
|
||||
|
||||
writePos = (writePos + words) & (FIFO_SIF_W - 1);
|
||||
size += words;
|
||||
|
@ -69,8 +69,8 @@ struct sifFifo
|
|||
const int wP0 = std::min((FIFO_SIF_W - readPos), words);
|
||||
const int wP1 = words - wP0;
|
||||
|
||||
memcpy_fast(to, &data[readPos], wP0 << 2);
|
||||
memcpy_fast(&to[wP0], &data[0], wP1 << 2);
|
||||
memcpy(to, &data[readPos], wP0 << 2);
|
||||
memcpy(&to[wP0], &data[0], wP1 << 2);
|
||||
|
||||
readPos = (readPos + words) & (FIFO_SIF_W - 1);
|
||||
size -= words;
|
||||
|
|
|
@ -319,7 +319,7 @@ SIO_WRITE memcardErase(u8 data)
|
|||
{
|
||||
case 0x82: // Erase
|
||||
//siomode = SIO_DUMMY; // Nothing more to do here.
|
||||
memcpy_fast(sio.buf, &header[1], 4);
|
||||
memcpy(sio.buf, &header[1], 4);
|
||||
sio.bufSize = 3;
|
||||
mcd->EraseBlock();
|
||||
break;
|
||||
|
@ -367,7 +367,7 @@ SIO_WRITE memcardWrite(u8 data)
|
|||
switch(data)
|
||||
{
|
||||
case 0x42: // Write
|
||||
memcpy_fast(sio.buf, header, 4);
|
||||
memcpy(sio.buf, header, 4);
|
||||
once = true;
|
||||
break;
|
||||
|
||||
|
@ -375,7 +375,7 @@ SIO_WRITE memcardWrite(u8 data)
|
|||
if(once)
|
||||
{
|
||||
siomode = SIO_DUMMY; // Nothing more to do here.
|
||||
memcpy_fast(sio.buf, &header[1], 4);
|
||||
memcpy(sio.buf, &header[1], 4);
|
||||
sio.bufSize = 3;
|
||||
|
||||
sio2.packet.recvVal1 = 0x1600; // Writing
|
||||
|
@ -454,7 +454,7 @@ SIO_WRITE memcardRead(u8 data)
|
|||
switch(data)
|
||||
{
|
||||
case 0x43: // Read
|
||||
memcpy_fast(sio.buf, header, 4);
|
||||
memcpy(sio.buf, header, 4);
|
||||
once = true;
|
||||
break;
|
||||
|
||||
|
@ -462,7 +462,7 @@ SIO_WRITE memcardRead(u8 data)
|
|||
if(once)
|
||||
{
|
||||
siomode = SIO_DUMMY; // Nothing more to do here.
|
||||
memcpy_fast(sio.buf, &header[1], 4);
|
||||
memcpy(sio.buf, &header[1], 4);
|
||||
sio.bufSize = 3;
|
||||
|
||||
sio2.packet.recvVal1 = 0x1700; // Reading
|
||||
|
@ -624,7 +624,7 @@ SIO_WRITE sioWriteMemcard(u8 data)
|
|||
cmd.mc_xor = info.Xor;
|
||||
cmd.Z = mcd->term;
|
||||
|
||||
memcpy_fast(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
|
||||
memcpy(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -698,7 +698,7 @@ SIO_WRITE sioWriteMemcardPSX(u8 data)
|
|||
{
|
||||
case 0x53: // PSX 'S'tate // haven't seen it happen yet
|
||||
sio.buf[1] = mcd->FLAG;
|
||||
memcpy_fast(&sio.buf[2], memcard_psx, 8);
|
||||
memcpy(&sio.buf[2], memcard_psx, 8);
|
||||
siomode = SIO_DUMMY;
|
||||
break;
|
||||
|
||||
|
|
|
@ -296,9 +296,9 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
|
|||
if (!idx) CpuVU0->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
|
||||
else CpuVU1->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
|
||||
|
||||
memcpy_fast(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
|
||||
memcpy(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
|
||||
size -= ((idx ? 0x4000 : 0x1000) - addr) / 4;
|
||||
memcpy_fast(VUx.Micro, data, size);
|
||||
memcpy(VUx.Micro, data, size);
|
||||
|
||||
vifX.tag.addr = size * 4;
|
||||
}
|
||||
|
@ -310,7 +310,7 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
|
|||
// Clear VU memory before writing!
|
||||
if (!idx) CpuVU0->Clear(addr, size*4);
|
||||
else CpuVU1->Clear(addr, size*4);
|
||||
memcpy_fast(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
|
||||
memcpy(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
|
||||
|
||||
vifX.tag.addr += size * 4;
|
||||
}
|
||||
|
|
|
@ -584,7 +584,7 @@ bool ConsoleLogFrame::Write( ConsoleColors color, const wxString& text )
|
|||
|
||||
int endpos = m_CurQueuePos + text.Length();
|
||||
m_QueueBuffer.MakeRoomFor( endpos + 1 ); // and the null!!
|
||||
memcpy_fast( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
|
||||
memcpy( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
|
||||
m_CurQueuePos = endpos;
|
||||
|
||||
// this NULL may be overwritten if the next message sent doesn't perform a color change.
|
||||
|
|
|
@ -126,17 +126,17 @@ namespace VU1micro
|
|||
#endif
|
||||
|
||||
runCount++;
|
||||
memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy_const((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
|
||||
do { // while loop needed since not always will return finished
|
||||
SuperVUExecuteProgram(VU1.VI[ REG_TPC ].UL & 0x3fff, 1);
|
||||
} while( VU0.VI[ REG_VPU_STAT ].UL&0x100 );
|
||||
|
||||
memcpy_const((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
memcpy((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
|
||||
//Currently breaking mVU execution is disabled. Check mVUtestCycles<vuIndex>() in microVU_Compile.inl
|
||||
runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);
|
||||
|
@ -227,8 +227,8 @@ namespace VU1micro
|
|||
if (mVUdebugNow) {
|
||||
|
||||
resetVUrec(1);
|
||||
memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
|
||||
runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);
|
||||
|
||||
|
|
|
@ -1016,8 +1016,8 @@ void SaveBranchState()
|
|||
s_psaveInstInfo = g_pCurInstInfo;
|
||||
|
||||
// save all mmx regs
|
||||
memcpy_const(s_saveMMXregs, mmxregs, sizeof(mmxregs));
|
||||
memcpy_const(s_saveXMMregs, xmmregs, sizeof(xmmregs));
|
||||
memcpy(s_saveMMXregs, mmxregs, sizeof(mmxregs));
|
||||
memcpy(s_saveXMMregs, xmmregs, sizeof(xmmregs));
|
||||
}
|
||||
|
||||
void LoadBranchState()
|
||||
|
@ -1031,8 +1031,8 @@ void LoadBranchState()
|
|||
g_pCurInstInfo = s_psaveInstInfo;
|
||||
|
||||
// restore all mmx regs
|
||||
memcpy_const(mmxregs, s_saveMMXregs, sizeof(mmxregs));
|
||||
memcpy_const(xmmregs, s_saveXMMregs, sizeof(xmmregs));
|
||||
memcpy(mmxregs, s_saveMMXregs, sizeof(mmxregs));
|
||||
memcpy(xmmregs, s_saveXMMregs, sizeof(xmmregs));
|
||||
}
|
||||
|
||||
void iFlushCall(int flushtype)
|
||||
|
@ -2179,7 +2179,7 @@ StartRecomp:
|
|||
}
|
||||
}
|
||||
|
||||
memcpy_fast(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
|
||||
memcpy(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
|
||||
}
|
||||
|
||||
s_pCurBlock->SetFnptr((uptr)recPtr);
|
||||
|
|
|
@ -181,8 +181,8 @@ __ri microProgram* mVUcreateProg(microVU& mVU, int startPC) {
|
|||
|
||||
// Caches Micro Program
|
||||
__ri void mVUcacheProg(microVU& mVU, microProgram& prog) {
|
||||
if (!mVU.index) memcpy_const(prog.data, mVU.regs().Micro, 0x1000);
|
||||
else memcpy_const(prog.data, mVU.regs().Micro, 0x4000);
|
||||
if (!mVU.index) memcpy(prog.data, mVU.regs().Micro, 0x1000);
|
||||
else memcpy(prog.data, mVU.regs().Micro, 0x4000);
|
||||
mVUdumpProg(mVU, prog);
|
||||
}
|
||||
|
||||
|
|
|
@ -92,7 +92,7 @@ public:
|
|||
blockEnd = blockList = newBlock;
|
||||
}
|
||||
|
||||
memcpy_const(&newBlock->block, pBlock, sizeof(microBlock));
|
||||
memcpy(&newBlock->block, pBlock, sizeof(microBlock));
|
||||
thisBlock = &newBlock->block;
|
||||
}
|
||||
return thisBlock;
|
||||
|
|
|
@ -170,7 +170,7 @@ void normBranchCompile(microVU& mVU, u32 branchPC) {
|
|||
}
|
||||
|
||||
void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
|
||||
memcpy_const(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
|
||||
memcpy(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
|
||||
mVUsetupBranch(mVU, mFC);
|
||||
mVUbackupRegs(mVU);
|
||||
|
||||
|
@ -386,7 +386,7 @@ void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
|
|||
s32* ajmp = xJcc32((JccComparisonType)JMPcc);
|
||||
u32 bPC = iPC; // mVUcompile can modify iPC, mVUpBlock, and mVUregs so back them up
|
||||
microBlock* pBlock = mVUpBlock;
|
||||
memcpy_const(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
|
||||
memcpy(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
|
||||
|
||||
incPC2(1); // Get PC for branch not-taken
|
||||
mVUcompile(mVU, xPC, (uptr)&mVUregs);
|
||||
|
|
|
@ -427,10 +427,10 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr) {
|
|||
mVU.p = 0; // All blocks start at p index #0
|
||||
mVU.q = 0; // All blocks start at q index #0
|
||||
if ((uptr)&mVUregs != pState) { // Loads up Pipeline State Info
|
||||
memcpy_const((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
|
||||
memcpy((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
|
||||
}
|
||||
if (doEarlyExit(mVU) && ((uptr)&mVU.prog.lpState != pState)) {
|
||||
memcpy_const((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
|
||||
memcpy((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
|
||||
}
|
||||
mVUblock.x86ptrStart = thisPtr;
|
||||
mVUpBlock = mVUblocks[mVUstartPC/2]->add(&mVUblock); // Add this block to block manager
|
||||
|
@ -530,7 +530,7 @@ void mVUDoTBit(microVU& mVU, microFlagCycles* mFC)
|
|||
|
||||
void mVUSaveFlags(microVU& mVU,microFlagCycles &mFC, microFlagCycles &mFCBackup)
|
||||
{
|
||||
memcpy_fast(&mFCBackup, &mFC, sizeof(microFlagCycles));
|
||||
memcpy(&mFCBackup, &mFC, sizeof(microFlagCycles));
|
||||
mVUsetFlags(mVU, mFCBackup); // Sets Up Flag instances
|
||||
}
|
||||
void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
|
||||
|
|
|
@ -86,7 +86,7 @@ public:
|
|||
wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
|
||||
);
|
||||
}
|
||||
memcpy_const(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
|
||||
memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
|
||||
}
|
||||
void clear() {
|
||||
for (int i = 0; i < hSize; i++) {
|
||||
|
|
|
@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) {
|
|||
|
||||
if (ret == vif.tag.size) { // Full Transfer
|
||||
if (v.bSize) { // Last transfer was partial
|
||||
memcpy_aligned(&v.buffer[v.bSize], data, size);
|
||||
memcpy(&v.buffer[v.bSize], data, size);
|
||||
v.bSize += size;
|
||||
size = v.bSize;
|
||||
data = v.buffer;
|
||||
|
@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) {
|
|||
v.bSize = 0;
|
||||
}
|
||||
else { // Partial Transfer
|
||||
memcpy_aligned(&v.buffer[v.bSize], data, size);
|
||||
memcpy(&v.buffer[v.bSize], data, size);
|
||||
v.bSize += size;
|
||||
vif.tag.size -= ret;
|
||||
|
||||
|
|
|
@ -131,15 +131,15 @@ void recSuperVU1::Execute(u32 cycles) {
|
|||
#endif
|
||||
|
||||
runCount++;
|
||||
memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy_const((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);
|
||||
memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
|
||||
memcpy((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);
|
||||
|
||||
runMVU1(cycles);
|
||||
|
||||
memcpy_const((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
|
||||
memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
memcpy((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
|
||||
memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
|
||||
memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
|
||||
runSVU1(cycles);
|
||||
if ((memcmp((u8*)cmpVUregs, (u8*)&VU1, (16*32) + (16*16))) || (memcmp((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000))) {
|
||||
|
@ -230,8 +230,8 @@ void recSuperVU1::Execute(u32 cycles) {
|
|||
|
||||
resetMVU1();
|
||||
|
||||
memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
|
||||
memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
|
||||
|
||||
runMVU1(cycles);
|
||||
|
||||
|
|
|
@ -898,7 +898,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
|
|||
#ifdef SUPERVU_CACHING
|
||||
//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
|
||||
r.pmem = malloc(r.size);
|
||||
memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
|
||||
memcpy(r.pmem, &VU->Micro[r.start], r.size);
|
||||
#endif
|
||||
s_pFnHeader->ranges.push_back(r);
|
||||
}
|
||||
|
|
|
@ -647,7 +647,6 @@ char *SysLibError(); // Gets previous error loading symbols
|
|||
void SysCloseLibrary(void *lib); // Closes Library
|
||||
void SysMessage(char *fmt, ...);
|
||||
|
||||
extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
|
||||
extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);
|
||||
|
||||
template <typename T>
|
||||
|
|
|
@ -148,7 +148,6 @@
|
|||
<ClCompile Include="Conf.cpp" />
|
||||
<ClCompile Include="..\GSmain.cpp" />
|
||||
<ClCompile Include="..\Mem.cpp" />
|
||||
<ClCompile Include="..\memcpy_amd.cpp" />
|
||||
<ClCompile Include="..\Regs.cpp" />
|
||||
<ClCompile Include="..\targets.cpp" />
|
||||
<ClCompile Include="Win32.cpp" />
|
||||
|
|
|
@ -24,9 +24,6 @@
|
|||
<ClCompile Include="..\Mem.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\memcpy_amd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Regs.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -154,7 +154,6 @@
|
|||
<ClCompile Include="Conf.cpp" />
|
||||
<ClCompile Include="..\GSmain.cpp" />
|
||||
<ClCompile Include="..\Mem.cpp" />
|
||||
<ClCompile Include="..\memcpy_amd.cpp" />
|
||||
<ClCompile Include="..\Regs.cpp" />
|
||||
<ClCompile Include="..\targets.cpp" />
|
||||
<ClCompile Include="Win32.cpp" />
|
||||
|
|
|
@ -24,9 +24,6 @@
|
|||
<ClCompile Include="..\Mem.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\memcpy_amd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Regs.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -154,7 +154,6 @@
|
|||
<ClCompile Include="Conf.cpp" />
|
||||
<ClCompile Include="..\GSmain.cpp" />
|
||||
<ClCompile Include="..\Mem.cpp" />
|
||||
<ClCompile Include="..\memcpy_amd.cpp" />
|
||||
<ClCompile Include="..\Regs.cpp" />
|
||||
<ClCompile Include="..\targets.cpp" />
|
||||
<ClCompile Include="Win32.cpp" />
|
||||
|
|
|
@ -24,9 +24,6 @@
|
|||
<ClCompile Include="..\Mem.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\memcpy_amd.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Regs.cpp">
|
||||
<Filter>Source Files</Filter>
|
||||
</ClCompile>
|
||||
|
|
|
@ -1,479 +0,0 @@
|
|||
/******************************************************************************
|
||||
|
||||
Copyright (c) 2001 Advanced Micro Devices, Inc.
|
||||
|
||||
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
|
||||
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
|
||||
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
|
||||
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
|
||||
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
|
||||
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
|
||||
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
|
||||
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
|
||||
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
|
||||
NOT APPLY TO YOU.
|
||||
|
||||
AMD does not assume any responsibility for any errors which may appear in the
|
||||
Materials nor any responsibility to support or update the Materials. AMD retains
|
||||
the right to make changes to its test specifications at any time, without notice.
|
||||
|
||||
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
|
||||
further information, software, technical information, know-how, or show-how
|
||||
available to you.
|
||||
|
||||
So that all may benefit from your experience, please report any problems
|
||||
or suggestions about this software to 3dsdk.support@amd.com
|
||||
|
||||
AMD Developer Technologies, M/S 585
|
||||
Advanced Micro Devices, Inc.
|
||||
5900 E. Ben White Blvd.
|
||||
Austin, TX 78741
|
||||
3dsdk.support@amd.com
|
||||
******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/*****************************************************************************
|
||||
MEMCPY_AMD.CPP
|
||||
******************************************************************************/
|
||||
|
||||
// Very optimized memcpy() routine for AMD Athlon and Duron family.
|
||||
// This code uses any of FOUR different basic copy methods, depending
|
||||
// on the transfer size.
|
||||
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
|
||||
// "Streaming Store"), and also uses the software prefetch instructions,
|
||||
// be sure you're running on Athlon/Duron or other recent CPU before calling!
|
||||
|
||||
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop".
|
||||
|
||||
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
|
||||
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
|
||||
// also using the "unrolled loop" optimization. This code uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
|
||||
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
|
||||
|
||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
||||
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
|
||||
//#include <stddef.h>
|
||||
|
||||
// Inline assembly syntax for use with Visual C++
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "PS2Etypes.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__x86_64__)
|
||||
|
||||
void * memcpy_amd(void *dest, const void *src, size_t n)
|
||||
{
|
||||
__asm {
|
||||
mov ecx, [n] ; number of bytes to copy
|
||||
mov edi, [dest] ; destination
|
||||
mov esi, [src] ; source
|
||||
mov ebx, ecx ; keep a copy of count
|
||||
|
||||
cld
|
||||
cmp ecx, TINY_BLOCK_COPY
|
||||
jb $memcpy_ic_3 ; tiny? skip mmx copy
|
||||
|
||||
cmp ecx, 32*1024 ; don't align between 32k-64k because
|
||||
jbe $memcpy_do_align ; it appears to be slower
|
||||
cmp ecx, 64*1024
|
||||
jbe $memcpy_align_done
|
||||
$memcpy_do_align:
|
||||
mov ecx, 8 ; a trick that's faster than rep movsb...
|
||||
sub ecx, edi ; align destination to qword
|
||||
and ecx, 111b ; get the low bits
|
||||
sub ebx, ecx ; update copy count
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_align_done
|
||||
jmp ecx ; jump to array of movsb's
|
||||
|
||||
align 4
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
|
||||
$memcpy_align_done: ; destination is dword aligned
|
||||
mov ecx, ebx ; number of bytes left to copy
|
||||
shr ecx, 6 ; get 64-byte block count
|
||||
jz $memcpy_ic_2 ; finish the last few bytes
|
||||
|
||||
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
|
||||
jae $memcpy_uc_test
|
||||
|
||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
align 16
|
||||
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0, [esi+0] ; read 64 bits
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0 ; write 64 bits
|
||||
movq [edi+8], mm1 ; note: the normal movq writes the
|
||||
movq mm2, [esi+16] ; data to cache; a cache line will be
|
||||
movq mm3, [esi+24] ; allocated as needed, to store the data
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
|
||||
add esi, 64 ; update source pointer
|
||||
add edi, 64 ; update destination pointer
|
||||
dec ecx ; count down
|
||||
jnz $memcpy_ic_1 ; last 64-byte block?
|
||||
|
||||
$memcpy_ic_2:
|
||||
mov ecx, ebx ; has valid low 6 bits of the byte count
|
||||
$memcpy_ic_3:
|
||||
shr ecx, 2 ; dword count
|
||||
and ecx, 1111b ; only look at the "remainder" bits
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_last_few
|
||||
jmp ecx ; jump to array of movsd's
|
||||
|
||||
$memcpy_uc_test:
|
||||
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
|
||||
jae $memcpy_bp_1
|
||||
|
||||
$memcpy_64_test:
|
||||
or ecx, ecx ; tail end of block prefetch will jump here
|
||||
jz $memcpy_ic_2 ; no more 64-byte blocks left
|
||||
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
align 16
|
||||
$memcpy_uc_1: ; 64-byte blocks, uncached copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0,[esi+0] ; read 64 bits
|
||||
add edi,64 ; update destination pointer
|
||||
movq mm1,[esi+8]
|
||||
add esi,64 ; update source pointer
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
|
||||
movq mm0,[esi-40] ; note: movntq also prevents the CPU
|
||||
movntq [edi-56], mm1 ; from READING the destination address
|
||||
movq mm1,[esi-32] ; into the cache, only to be over-written
|
||||
movntq [edi-48], mm2 ; so that also helps performance
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
dec ecx
|
||||
movntq [edi-8], mm1
|
||||
jnz $memcpy_uc_1 ; last 64-byte block?
|
||||
|
||||
jmp $memcpy_ic_2 ; almost done
|
||||
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
$memcpy_bp_1: ; large blocks, block prefetch copy
|
||||
|
||||
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
|
||||
jl $memcpy_64_test ; no, back to regular uncached copy
|
||||
|
||||
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
||||
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
||||
align 16
|
||||
$memcpy_bp_2:
|
||||
mov edx, [esi-64] ; grab one address per cache line
|
||||
mov edx, [esi-128] ; grab one address per cache line
|
||||
sub esi, 128 ; go reverse order to suppress HW prefetcher
|
||||
dec eax ; count down the cache lines
|
||||
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
|
||||
|
||||
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
|
||||
align 16
|
||||
$memcpy_bp_3:
|
||||
movq mm0, [esi ] ; read 64 bits
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, 64 ; update source pointer
|
||||
movntq [edi ], mm0 ; write 64 bits, bypassing cache
|
||||
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
|
||||
movntq [edi+16], mm2 ; from READING the destination address
|
||||
movntq [edi+24], mm3 ; into the cache, only to be over-written,
|
||||
movntq [edi+32], mm4 ; so that also helps performance
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, 64 ; update dest pointer
|
||||
|
||||
dec eax ; count down
|
||||
|
||||
jnz $memcpy_bp_3 ; keep copying
|
||||
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
||||
jmp $memcpy_bp_1 ; keep processing chunks
|
||||
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop". Then it handles the last few bytes.
|
||||
align 4
|
||||
movsd
|
||||
movsd ; perform last 1-15 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd ; perform last 1-7 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
|
||||
$memcpy_last_few: ; dword aligned from before movsd's
|
||||
mov ecx, ebx ; has valid low 2 bits of the byte count
|
||||
and ecx, 11b ; the last few cows must come home
|
||||
jz $memcpy_final ; no more, let's leave
|
||||
rep movsb ; the last 1, 2, or 3 bytes
|
||||
|
||||
$memcpy_final:
|
||||
emms ; clean up the MMX state
|
||||
sfence ; flush the write buffer
|
||||
mov eax, [dest] ; ret value = destination pointer
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// mmx memcmp implementation, size has to be a multiple of 8
|
||||
// returns 0 if equal, nonzero value if not equal
|
||||
// ~10 times faster than standard memcmp
|
||||
// (zerofrog)
|
||||
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
|
||||
{
|
||||
assert( (cmpsize&7) == 0 );
|
||||
|
||||
__asm {
|
||||
push esi
|
||||
mov ecx, cmpsize
|
||||
mov edx, src1
|
||||
mov esi, src2
|
||||
|
||||
cmp ecx, 32
|
||||
jl Done4
|
||||
|
||||
// custom test first 8 to make sure things are ok
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pand mm0, mm1
|
||||
movq mm2, [esi+16]
|
||||
pmovmskb eax, mm0
|
||||
movq mm3, [esi+24]
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je NextComp
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
NextComp:
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm2, mm3
|
||||
pmovmskb eax, mm2
|
||||
|
||||
sub ecx, 32
|
||||
add esi, 32
|
||||
add edx, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je ContinueTest
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
cmp ecx, 64
|
||||
jl Done8
|
||||
|
||||
Cmp8:
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm0, mm1
|
||||
pcmpeqd mm4, [edx+32]
|
||||
pand mm0, mm2
|
||||
pcmpeqd mm5, [edx+40]
|
||||
pand mm0, mm3
|
||||
pcmpeqd mm6, [edx+48]
|
||||
pand mm0, mm4
|
||||
pcmpeqd mm7, [edx+56]
|
||||
pand mm0, mm5
|
||||
pand mm0, mm6
|
||||
pand mm0, mm7
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je Continue
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Continue:
|
||||
sub ecx, 64
|
||||
add esi, 64
|
||||
add edx, 64
|
||||
ContinueTest:
|
||||
cmp ecx, 64
|
||||
jge Cmp8
|
||||
|
||||
Done8:
|
||||
test ecx, 0x20
|
||||
jz Done4
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pand mm0, mm3
|
||||
pmovmskb eax, mm0
|
||||
sub ecx, 32
|
||||
add esi, 32
|
||||
add edx, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je Done4
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Done4:
|
||||
cmp ecx, 24
|
||||
jne Done2
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done2:
|
||||
cmp ecx, 16
|
||||
jne Done1
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pand mm0, mm1
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done1:
|
||||
cmp ecx, 8
|
||||
jne Done
|
||||
|
||||
mov eax, [esi]
|
||||
mov esi, [esi+4]
|
||||
cmp eax, [edx]
|
||||
je Next
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Next:
|
||||
cmp esi, [edx+4]
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done:
|
||||
xor eax, eax
|
||||
|
||||
End:
|
||||
pop esi
|
||||
emms
|
||||
}
|
||||
}
|
||||
|
||||
#else // _MSC_VER
|
||||
// assume gcc or mingw or win x64
|
||||
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
void * memcpy_amd(void *dest, const void *src, size_t n)
|
||||
{
|
||||
memcpy(dest, src, n);
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
}
|
|
@ -2026,7 +2026,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
targ->clut.resize(clutsize);
|
||||
|
||||
if( tex0.cpsm <= 1 ) { // 32 bit
|
||||
memcpy_amd(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
|
||||
memcpy(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
|
||||
}
|
||||
else {
|
||||
u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
|
||||
|
@ -2110,7 +2110,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
targ->memory->ref = 1;
|
||||
}
|
||||
|
||||
memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
|
||||
u8* psrc = (u8*)(ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);
|
||||
|
||||
|
@ -2136,7 +2136,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
targ->memory->ref = 1;
|
||||
}
|
||||
|
||||
memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
|
||||
// needs to be 8 bit, use xmm for unpacking
|
||||
u16* dst = (u16*)lock.pBits;
|
||||
|
@ -2219,7 +2219,7 @@ Z16Loop:
|
|||
targ->memory = NULL;
|
||||
}
|
||||
|
||||
memcpy_amd(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
|
||||
memcpy(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2239,7 +2239,7 @@ void ZeroGS::Flush(int context)
|
|||
}
|
||||
|
||||
if( curvb.tex0.cpsm <= 1 ) { // 32 bit
|
||||
memcpy_amd(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
|
||||
memcpy(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
|
||||
}
|
||||
else {
|
||||
u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
|
||||
|
@ -5087,7 +5087,7 @@ void ZeroGS::CaptureFrame()
|
|||
|
||||
BYTE* pend = (BYTE*)lock.pBits + (conf.height-1)*width*4;
|
||||
for(int i = 0; i < conf.height; ++i) {
|
||||
memcpy_amd(&mem[width*4*i], pend - width*4*i, width * 4);
|
||||
memcpy(&mem[width*4*i], pend - width*4*i, width * 4);
|
||||
}
|
||||
s_ptexAVICapture->UnlockRect();
|
||||
|
||||
|
|
|
@ -36,7 +36,6 @@ set(zerogsSources
|
|||
GSmain.cpp
|
||||
GLWinX11.cpp
|
||||
Mem.cpp
|
||||
memcpy_amd.cpp
|
||||
rasterfont.cpp
|
||||
Regs.cpp
|
||||
targets.cpp
|
||||
|
|
|
@ -728,7 +728,6 @@ char *SysLibError(); // Gets previous error loading sysbols
|
|||
void SysCloseLibrary(void *lib); // Closes Library
|
||||
void SysMessage(char *fmt, ...);
|
||||
|
||||
extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
|
||||
extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);
|
||||
|
||||
template <typename T>
|
||||
|
|
|
@ -23,7 +23,7 @@ libZeroGSogl_LDFLAGS+=-Wl,-soname,@ZEROGS_SONAME@
|
|||
libZeroGSogl_LDADD=$(libZeroGSogl_a_OBJECTS)
|
||||
|
||||
libZeroGSogl_a_SOURCES = \
|
||||
GSmain.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp \
|
||||
GSmain.cpp Regs.cpp x86.cpp zpipe.cpp \
|
||||
Mem.cpp rasterfont.cpp targets.cpp zerogs.cpp GifTransfer.cpp GLWinX11.cpp
|
||||
|
||||
libZeroGSogl_a_SOURCES += x86-32.S
|
||||
|
|
|
@ -1,478 +0,0 @@
|
|||
/******************************************************************************
|
||||
|
||||
Copyright (c) 2001 Advanced Micro Devices, Inc.
|
||||
|
||||
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
|
||||
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
|
||||
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
|
||||
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
|
||||
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
|
||||
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
|
||||
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
|
||||
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
|
||||
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
|
||||
NOT APPLY TO YOU.
|
||||
|
||||
AMD does not assume any responsibility for any errors which may appear in the
|
||||
Materials nor any responsibility to support or update the Materials. AMD retains
|
||||
the right to make changes to its test specifications at any time, without notice.
|
||||
|
||||
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
|
||||
further information, software, technical information, know-how, or show-how
|
||||
available to you.
|
||||
|
||||
So that all may benefit from your experience, please report any problems
|
||||
or suggestions about this software to 3dsdk.support@amd.com
|
||||
|
||||
AMD Developer Technologies, M/S 585
|
||||
Advanced Micro Devices, Inc.
|
||||
5900 E. Ben White Blvd.
|
||||
Austin, TX 78741
|
||||
3dsdk.support@amd.com
|
||||
******************************************************************************/
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
/*****************************************************************************
|
||||
MEMCPY_AMD.CPP
|
||||
******************************************************************************/
|
||||
|
||||
// Very optimized memcpy() routine for AMD Athlon and Duron family.
|
||||
// This code uses any of FOUR different basic copy methods, depending
|
||||
// on the transfer size.
|
||||
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
|
||||
// "Streaming Store"), and also uses the software prefetch instructions,
|
||||
// be sure you're running on Athlon/Duron or other recent CPU before calling!
|
||||
|
||||
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop".
|
||||
|
||||
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
|
||||
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
|
||||
// also using the "unrolled loop" optimization. This code uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
|
||||
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
|
||||
|
||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
||||
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
|
||||
//#include <stddef.h>
|
||||
|
||||
// Inline assembly syntax for use with Visual C++
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
#include "PS2Etypes.h"
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
|
||||
void * memcpy_amd(void *dest, const void *src, size_t n)
|
||||
{
|
||||
__asm {
|
||||
mov ecx, [n] ; number of bytes to copy
|
||||
mov edi, [dest] ; destination
|
||||
mov esi, [src] ; source
|
||||
mov ebx, ecx ; keep a copy of count
|
||||
|
||||
cld
|
||||
cmp ecx, TINY_BLOCK_COPY
|
||||
jb $memcpy_ic_3 ; tiny? skip mmx copy
|
||||
|
||||
cmp ecx, 32*1024 ; don't align between 32k-64k because
|
||||
jbe $memcpy_do_align ; it appears to be slower
|
||||
cmp ecx, 64*1024
|
||||
jbe $memcpy_align_done
|
||||
$memcpy_do_align:
|
||||
mov ecx, 8 ; a trick that's faster than rep movsb...
|
||||
sub ecx, edi ; align destination to qword
|
||||
and ecx, 111b ; get the low bits
|
||||
sub ebx, ecx ; update copy count
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_align_done
|
||||
jmp ecx ; jump to array of movsb's
|
||||
|
||||
align 4
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
movsb
|
||||
|
||||
$memcpy_align_done: ; destination is dword aligned
|
||||
mov ecx, ebx ; number of bytes left to copy
|
||||
shr ecx, 6 ; get 64-byte block count
|
||||
jz $memcpy_ic_2 ; finish the last few bytes
|
||||
|
||||
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
|
||||
jae $memcpy_uc_test
|
||||
|
||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
||||
// the software prefetch instruction to get the data into the cache.
|
||||
align 16
|
||||
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0, [esi+0] ; read 64 bits
|
||||
movq mm1, [esi+8]
|
||||
movq [edi+0], mm0 ; write 64 bits
|
||||
movq [edi+8], mm1 ; note: the normal movq writes the
|
||||
movq mm2, [esi+16] ; data to cache; a cache line will be
|
||||
movq mm3, [esi+24] ; allocated as needed, to store the data
|
||||
movq [edi+16], mm2
|
||||
movq [edi+24], mm3
|
||||
movq mm0, [esi+32]
|
||||
movq mm1, [esi+40]
|
||||
movq [edi+32], mm0
|
||||
movq [edi+40], mm1
|
||||
movq mm2, [esi+48]
|
||||
movq mm3, [esi+56]
|
||||
movq [edi+48], mm2
|
||||
movq [edi+56], mm3
|
||||
|
||||
add esi, 64 ; update source pointer
|
||||
add edi, 64 ; update destination pointer
|
||||
dec ecx ; count down
|
||||
jnz $memcpy_ic_1 ; last 64-byte block?
|
||||
|
||||
$memcpy_ic_2:
|
||||
mov ecx, ebx ; has valid low 6 bits of the byte count
|
||||
$memcpy_ic_3:
|
||||
shr ecx, 2 ; dword count
|
||||
and ecx, 1111b ; only look at the "remainder" bits
|
||||
neg ecx ; set up to jump into the array
|
||||
add ecx, offset $memcpy_last_few
|
||||
jmp ecx ; jump to array of movsd's
|
||||
|
||||
$memcpy_uc_test:
|
||||
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
|
||||
jae $memcpy_bp_1
|
||||
|
||||
$memcpy_64_test:
|
||||
or ecx, ecx ; tail end of block prefetch will jump here
|
||||
jz $memcpy_ic_2 ; no more 64-byte blocks left
|
||||
|
||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
||||
// bypasses the cache and writes straight to main memory. This code also
|
||||
// uses the software prefetch instruction to pre-read the data.
|
||||
align 16
|
||||
$memcpy_uc_1: ; 64-byte blocks, uncached copy
|
||||
|
||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
||||
|
||||
movq mm0,[esi+0] ; read 64 bits
|
||||
add edi,64 ; update destination pointer
|
||||
movq mm1,[esi+8]
|
||||
add esi,64 ; update source pointer
|
||||
movq mm2,[esi-48]
|
||||
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
|
||||
movq mm0,[esi-40] ; note: movntq also prevents the CPU
|
||||
movntq [edi-56], mm1 ; from READING the destination address
|
||||
movq mm1,[esi-32] ; into the cache, only to be over-written
|
||||
movntq [edi-48], mm2 ; so that also helps performance
|
||||
movq mm2,[esi-24]
|
||||
movntq [edi-40], mm0
|
||||
movq mm0,[esi-16]
|
||||
movntq [edi-32], mm1
|
||||
movq mm1,[esi-8]
|
||||
movntq [edi-24], mm2
|
||||
movntq [edi-16], mm0
|
||||
dec ecx
|
||||
movntq [edi-8], mm1
|
||||
jnz $memcpy_uc_1 ; last 64-byte block?
|
||||
|
||||
jmp $memcpy_ic_2 ; almost done
|
||||
|
||||
// For the largest size blocks, a special technique called Block Prefetch
|
||||
// can be used to accelerate the read operations. Block Prefetch reads
|
||||
// one address per cache line, for a series of cache lines, in a short loop.
|
||||
// This is faster than using software prefetch. The technique is great for
|
||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
||||
$memcpy_bp_1: ; large blocks, block prefetch copy
|
||||
|
||||
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
|
||||
jl $memcpy_64_test ; no, back to regular uncached copy
|
||||
|
||||
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
||||
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
||||
align 16
|
||||
$memcpy_bp_2:
|
||||
mov edx, [esi-64] ; grab one address per cache line
|
||||
mov edx, [esi-128] ; grab one address per cache line
|
||||
sub esi, 128 ; go reverse order to suppress HW prefetcher
|
||||
dec eax ; count down the cache lines
|
||||
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
|
||||
|
||||
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
|
||||
align 16
|
||||
$memcpy_bp_3:
|
||||
movq mm0, [esi ] ; read 64 bits
|
||||
movq mm1, [esi+ 8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
add esi, 64 ; update source pointer
|
||||
movntq [edi ], mm0 ; write 64 bits, bypassing cache
|
||||
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
|
||||
movntq [edi+16], mm2 ; from READING the destination address
|
||||
movntq [edi+24], mm3 ; into the cache, only to be over-written,
|
||||
movntq [edi+32], mm4 ; so that also helps performance
|
||||
movntq [edi+40], mm5
|
||||
movntq [edi+48], mm6
|
||||
movntq [edi+56], mm7
|
||||
add edi, 64 ; update dest pointer
|
||||
|
||||
dec eax ; count down
|
||||
|
||||
jnz $memcpy_bp_3 ; keep copying
|
||||
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
||||
jmp $memcpy_bp_1 ; keep processing chunks
|
||||
|
||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
||||
// form which is an "unrolled loop". Then it handles the last few bytes.
|
||||
align 4
|
||||
movsd
|
||||
movsd ; perform last 1-15 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd ; perform last 1-7 dword copies
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
movsd
|
||||
|
||||
$memcpy_last_few: ; dword aligned from before movsd's
|
||||
mov ecx, ebx ; has valid low 2 bits of the byte count
|
||||
and ecx, 11b ; the last few cows must come home
|
||||
jz $memcpy_final ; no more, let's leave
|
||||
rep movsb ; the last 1, 2, or 3 bytes
|
||||
|
||||
$memcpy_final:
|
||||
emms ; clean up the MMX state
|
||||
sfence ; flush the write buffer
|
||||
mov eax, [dest] ; ret value = destination pointer
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// mmx memcmp implementation, size has to be a multiple of 8
|
||||
// returns 0 if equal, nonzero value if not equal
|
||||
// ~10 times faster than standard memcmp
|
||||
// (zerofrog)
|
||||
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
|
||||
{
|
||||
assert( (cmpsize&7) == 0 );
|
||||
|
||||
__asm {
|
||||
push esi
|
||||
mov ecx, cmpsize
|
||||
mov edx, src1
|
||||
mov esi, src2
|
||||
|
||||
cmp ecx, 32
|
||||
jl Done4
|
||||
|
||||
// custom test first 8 to make sure things are ok
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pand mm0, mm1
|
||||
movq mm2, [esi+16]
|
||||
pmovmskb eax, mm0
|
||||
movq mm3, [esi+24]
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je NextComp
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
NextComp:
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm2, mm3
|
||||
pmovmskb eax, mm2
|
||||
|
||||
sub ecx, 32
|
||||
add esi, 32
|
||||
add edx, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je ContinueTest
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
cmp ecx, 64
|
||||
jl Done8
|
||||
|
||||
Cmp8:
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
movq mm4, [esi+32]
|
||||
movq mm5, [esi+40]
|
||||
movq mm6, [esi+48]
|
||||
movq mm7, [esi+56]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm0, mm1
|
||||
pcmpeqd mm4, [edx+32]
|
||||
pand mm0, mm2
|
||||
pcmpeqd mm5, [edx+40]
|
||||
pand mm0, mm3
|
||||
pcmpeqd mm6, [edx+48]
|
||||
pand mm0, mm4
|
||||
pcmpeqd mm7, [edx+56]
|
||||
pand mm0, mm5
|
||||
pand mm0, mm6
|
||||
pand mm0, mm7
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je Continue
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Continue:
|
||||
sub ecx, 64
|
||||
add esi, 64
|
||||
add edx, 64
|
||||
ContinueTest:
|
||||
cmp ecx, 64
|
||||
jge Cmp8
|
||||
|
||||
Done8:
|
||||
test ecx, 0x20
|
||||
jz Done4
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
movq mm3, [esi+24]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pcmpeqd mm3, [edx+24]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pand mm0, mm3
|
||||
pmovmskb eax, mm0
|
||||
sub ecx, 32
|
||||
add esi, 32
|
||||
add edx, 32
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
je Done4
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Done4:
|
||||
cmp ecx, 24
|
||||
jne Done2
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
movq mm2, [esi+16]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pcmpeqd mm2, [edx+16]
|
||||
pand mm0, mm1
|
||||
pand mm0, mm2
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done2:
|
||||
cmp ecx, 16
|
||||
jne Done1
|
||||
|
||||
movq mm0, [esi]
|
||||
movq mm1, [esi+8]
|
||||
pcmpeqd mm0, [edx]
|
||||
pcmpeqd mm1, [edx+8]
|
||||
pand mm0, mm1
|
||||
pmovmskb eax, mm0
|
||||
|
||||
// check if eq
|
||||
cmp eax, 0xff
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done1:
|
||||
cmp ecx, 8
|
||||
jne Done
|
||||
|
||||
mov eax, [esi]
|
||||
mov esi, [esi+4]
|
||||
cmp eax, [edx]
|
||||
je Next
|
||||
mov eax, 1
|
||||
jmp End
|
||||
|
||||
Next:
|
||||
cmp esi, [edx+4]
|
||||
setne al
|
||||
jmp End
|
||||
|
||||
Done:
|
||||
xor eax, eax
|
||||
|
||||
End:
|
||||
pop esi
|
||||
emms
|
||||
}
|
||||
}
|
||||
|
||||
#else // _MSC_VER
|
||||
// assume gcc
|
||||
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
void * memcpy_amd(void *dest, const void *src, size_t n)
|
||||
{
|
||||
memcpy(dest, src, n);
|
||||
return dest;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
}
|
|
@ -1789,7 +1789,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
targ->clut.resize(clutsize);
|
||||
|
||||
if( tex0.cpsm <= 1 ) { // 32 bit
|
||||
memcpy_amd(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
|
||||
memcpy(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
|
||||
}
|
||||
else {
|
||||
u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
|
||||
|
@ -1854,7 +1854,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
assert(targ->ptex->ref > 0 );
|
||||
}
|
||||
|
||||
memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
memcpy(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
|
||||
vector<u8> texdata;
|
||||
u8* ptexdata = NULL;
|
||||
|
||||
|
|
|
@ -2568,7 +2568,7 @@ void ZeroGS::Flush(int context)
|
|||
g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size();
|
||||
glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
|
||||
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
|
||||
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// glUnmapBuffer(GL_ARRAY_BUFFER);
|
||||
SET_STREAM();
|
||||
|
||||
|
@ -2652,7 +2652,7 @@ void ZeroGS::Flush(int context)
|
|||
}
|
||||
|
||||
if( curvb.tex0.cpsm <= 1 ) { // 32 bit
|
||||
memcpy_amd(&data[0], g_pbyGSClut+nClutOffset, clutsize);
|
||||
memcpy(&data[0], g_pbyGSClut+nClutOffset, clutsize);
|
||||
}
|
||||
else {
|
||||
u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
|
||||
|
@ -5779,7 +5779,7 @@ void ZeroGS::CaptureFrame()
|
|||
|
||||
// u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4;
|
||||
// for(int i = 0; i < conf.height; ++i) {
|
||||
// memcpy_amd(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
|
||||
// memcpy(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
|
||||
// }
|
||||
|
||||
int fps = SMODE1->CMOD == 3 ? 50 : 60;
|
||||
|
|
|
@ -436,7 +436,7 @@ namespace ZeroGS {
|
|||
if( nCount + nVerts > nNumVertices ) {
|
||||
// recreate except with a bigger count
|
||||
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256);
|
||||
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
nNumVertices *= 2;
|
||||
assert( nCount + nVerts <= nNumVertices );
|
||||
_aligned_free(pBufferData);
|
||||
|
|
|
@ -55,7 +55,6 @@ set(zzoglSources
|
|||
GSmain.cpp
|
||||
HostMemory.cpp
|
||||
Mem.cpp
|
||||
# memcpy_amd.cpp
|
||||
Mem_Swizzle.cpp
|
||||
Mem_Tables.cpp
|
||||
Profile.cpp
|
||||
|
|
|
@ -68,7 +68,6 @@ extern "C" char* CALLBACK PS2EgetLibName(void);
|
|||
#include "GSDump.h"
|
||||
|
||||
#include "Utilities/MemcpyFast.h"
|
||||
#define memcpy_amd memcpy_fast
|
||||
|
||||
extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path
|
||||
|
||||
|
|
|
@ -493,7 +493,7 @@ template <>
|
|||
// u32 specialization: copies clutsize bytes, starting at the address returned
// by GetClutBufferAddress<u32>(csa), into dst.
// NOTE(review): the memcpy_amd and memcpy calls below look like the removed and
// added sides of one diff hunk rather than two intended copies — confirm which
// one is live.
/*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
|
||||
{
|
||||
u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
|
||||
memcpy_amd((u8*)dst, clut, clutsize);
|
||||
memcpy((u8*)dst, clut, clutsize);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
|
|
@ -657,7 +657,7 @@ inline void FlushSetStream(VB& curvb)
|
|||
g_nCurVBOIndex = (g_nCurVBOIndex + 1) % g_vboBuffers.size();
|
||||
glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
|
||||
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
|
||||
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// glUnmapBuffer(GL_ARRAY_BUFFER);
|
||||
SET_STREAM();
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ class VB
|
|||
assert(pBufferData != NULL);
|
||||
nNumVertices *= 2;
|
||||
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
|
||||
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
assert(nCount <= nNumVertices);
|
||||
_aligned_free(pBufferData);
|
||||
pBufferData = ptemp;
|
||||
|
|
|
@ -1979,7 +1979,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
|
|||
assert(targ->ptex->ref > 0);
|
||||
}
|
||||
|
||||
memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
|
||||
memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
|
||||
|
||||
__aligned16 u8* ptexdata = NULL;
|
||||
bool has_data = false;
|
||||
|
|
|
@ -86,7 +86,6 @@ typedef signed long long int64;
|
|||
#include "GSDump.h"
|
||||
|
||||
#include "Utilities/MemcpyFast.h"
|
||||
#define memcpy_amd memcpy_fast
|
||||
|
||||
extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path
|
||||
|
||||
|
|
|
@ -489,7 +489,7 @@ template <>
|
|||
// u32 specialization: copies clutsize bytes, starting at the address returned
// by GetClutBufferAddress<u32>(csa), into dst.
// NOTE(review): the memcpy_amd and memcpy calls below look like the removed and
// added sides of one diff hunk rather than two intended copies — confirm which
// one is live.
/*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
|
||||
{
|
||||
u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
|
||||
memcpy_amd((u8*)dst, clut, clutsize);
|
||||
memcpy((u8*)dst, clut, clutsize);
|
||||
}
|
||||
|
||||
template <>
|
||||
|
|
|
@ -364,7 +364,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
|
|||
assert(targ->ptex->ref > 0);
|
||||
}
|
||||
|
||||
memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
|
||||
memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
|
||||
|
||||
__aligned16 u8* ptexdata = NULL;
|
||||
bool has_data = false;
|
||||
|
|
|
@ -535,7 +535,7 @@ inline void FlushSetStream(VB& curvb)
|
|||
|
||||
|
||||
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
|
||||
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
|
||||
// glUnmapBuffer(GL_ARRAY_BUFFER);
|
||||
SET_STREAM();
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ class VB
|
|||
assert(pBufferData != NULL);
|
||||
nNumVertices *= 2;
|
||||
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
|
||||
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
|
||||
assert(nCount <= nNumVertices);
|
||||
_aligned_free(pBufferData);
|
||||
pBufferData = ptemp;
|
||||
|
|
Loading…
Reference in New Issue