Merge pull request #215 from xsacha/memcpy

Remove some slow, redundant memcpy implementations: memcpy_const/memcpy_...
ramapcsx2 2014-09-12 19:57:57 +02:00
commit 98d22f8b2e
58 changed files with 116 additions and 1092 deletions
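
The pattern across the diff is mechanical: memcpy_fast, memcpy_const, and
memcpy_aligned call sites become plain memcpy, while memcpy_qwc call sites
(which counted 128-bit quadwords) gain a *16 to produce the byte counts
memcpy expects, and word-based FIFO copies keep their << 2 conversions.
Below is a minimal micro-benchmark sketch, not part of the commit, showing
how one could check that a libc memcpy matches or beats a custom routine at
emulator-typical sizes; the harness and its names (stdCopy, benchCopy) are
hypothetical.

// Hypothetical harness; sizes are illustrative (2048 = one CD sector
// payload, 0x4000 = VU1 micro memory), not taken from the PR.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

static void* stdCopy(void* d, const void* s, size_t n) { return std::memcpy(d, s, n); }

static void benchCopy(const char* name, void* (*copyFn)(void*, const void*, size_t),
                      size_t bytes, int iters)
{
    std::vector<unsigned char> src(bytes, 0xAA), dst(bytes);
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i)
        copyFn(dst.data(), src.data(), bytes);
    auto t1 = std::chrono::steady_clock::now();
    volatile unsigned char sink = dst.back(); (void)sink; // keep the copies observable
    std::printf("%-10s %8zu bytes: %.2f ms\n", name, bytes,
                std::chrono::duration<double, std::milli>(t1 - t0).count());
}

int main()
{
    for (size_t bytes : {size_t(64), size_t(2048), size_t(0x4000), size_t(1) << 20})
        benchCopy("memcpy", stdCopy, bytes, 10000);
    // benchCopy("memcpy_fast", memcpy_fast, bytes, 10000); // removed routine, if still linked
}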

View File

@@ -118,7 +118,7 @@ template< typename T >
 SafeArray<T>* SafeArray<T>::Clone() const
 {
     SafeArray<T>* retval = new SafeArray<T>( m_size );
-    memcpy_fast( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
+    memcpy( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
     return retval;
 }
@@ -160,7 +160,7 @@ template< typename T, uint Alignment >
 SafeAlignedArray<T,Alignment>* SafeAlignedArray<T,Alignment>::Clone() const
 {
     SafeAlignedArray<T,Alignment>* retval = new SafeAlignedArray<T,Alignment>( this->m_size );
-    memcpy_fast( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
+    memcpy( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
     return retval;
 }
@@ -272,14 +272,14 @@ void SafeList<T>::Remove( int index )
     int copylen = m_length - index;
     if( copylen > 0 )
-        memcpy_fast( &m_ptr[index], &m_ptr[index+1], copylen );
+        memcpy( &m_ptr[index], &m_ptr[index+1], copylen );
 }

 template< typename T >
 SafeList<T>* SafeList<T>::Clone() const
 {
     SafeList<T>* retval = new SafeList<T>( m_length );
-    memcpy_fast( retval->m_ptr, m_ptr, sizeof(T) * m_length );
+    memcpy( retval->m_ptr, m_ptr, sizeof(T) * m_length );
     return retval;
 }

View File

@@ -265,7 +265,7 @@ FastFormatUnicode& FastFormatUnicode::WriteV( const char* fmt, va_list argptr )
     const uint inspos = m_Length;
     const uint convLen = converted.Length();
     m_dest->MakeRoomFor((inspos + convLen + 64) * sizeof(wxChar));
-    memcpy_fast( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
+    memcpy( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
     m_Length += convLen;

     return *this;

View File

@@ -60,7 +60,7 @@ void xSmartJump::SetTarget()
         u8* destpos = xGetPtr();
         const int copylen = (sptr)target - (sptr)saveme;
-        memcpy_fast( destpos, saveme, copylen );
+        memcpy( destpos, saveme, copylen );
         xSetPtr( target - spacer );
     }
 }

View File

@@ -725,7 +725,7 @@ int cdvdReadSector() {
         mdest[11] = 0;

         // normal 2048 bytes of sector data
-        memcpy_const(&mdest[12], cdr.Transfer, 2048);
+        memcpy(&mdest[12], cdr.Transfer, 2048);

         // 4 bytes of edc (not calculated at present)
         mdest[2060] = 0;
@@ -735,7 +735,7 @@ int cdvdReadSector() {
     }
     else
     {
-        memcpy_fast( mdest, cdr.Transfer, cdvd.BlockSize);
+        memcpy( mdest, cdr.Transfer, cdvd.BlockSize);
     }

     // decrypt sector's bytes
@@ -1567,7 +1567,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
                 cdvd.Param[cdvd.ParamP-5], cdvd.Param[cdvd.ParamP-3], cdvd.Param[cdvd.ParamP-2], cdvd.Param[cdvd.ParamP-1]);
             Console.WriteLn("RTC Write Sec %d Min %d Hr %d Day %d Month %d Year %d", cdvd.RTC.second, cdvd.RTC.minute,
                 cdvd.RTC.hour, cdvd.RTC.day, cdvd.RTC.month, cdvd.RTC.year);*/
-            //memcpy_fast((u8*)&cdvd.RTC, cdvd.Param, 7);
+            //memcpy((u8*)&cdvd.RTC, cdvd.Param, 7);
             break;

         case 0x0A: // sceCdReadNVM (2:3)
@@ -1907,7 +1907,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
             }
             else
             {
-                memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
+                memcpy(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
                 cdvd.mg_size += cdvd.ParamC;
                 cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
             }
@@ -1915,9 +1915,9 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
         case 0x8E: // sceMgReadData
             SetResultSize( std::min(16, cdvd.mg_size) );
-            memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
+            memcpy(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
             cdvd.mg_size -= cdvd.ResultC;
-            memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
+            memcpy(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
             break;

         case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
@@ -1984,7 +1984,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
         {
             SetResultSize(3);//in:0
             int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-            memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
+            memcpy(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
             cdvd.mg_maxsize = 0; // don't allow any write
             cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data

View File

@@ -422,7 +422,7 @@ s32 CALLBACK ISOreadSector(u8* tempbuffer, u32 lsn, int mode)
         jNO_DEFAULT
     }

-    memcpy_fast(tempbuffer, pbuffer, psize);
+    memcpy(tempbuffer, pbuffer, psize);

     return 0;
 }

View File

@@ -917,7 +917,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
     }

     cdsize = (bcr & 0xffff) * 4;
-    memcpy_fast(iopPhysMem(madr), cdr.pTransfer, cdsize);
+    memcpy(iopPhysMem(madr), cdr.pTransfer, cdsize);
     psxCpu->Clear(madr, cdsize/4);
     cdr.pTransfer+=cdsize;
@@ -947,7 +947,7 @@ s32 CALLBACK cdvdDmaRead(s32 channel, u32* data, u32 bytesLeft, u32* bytesProces
         return 10000;
     }

-    memcpy_fast(data, cdr.pTransfer, wordsLeft);
+    memcpy(data, cdr.pTransfer, wordsLeft);
     //psxCpu->Clear(madr, cdsize/4);
     cdr.pTransfer+=wordsLeft;
     *wordsProcessed = wordsLeft;

View File

@@ -145,7 +145,7 @@ int InputIsoFile::FinishRead3(u8* dst, uint mode)
         length = end - _offset;

     uint read_offset = (m_current_lsn - m_read_lsn) * m_blocksize;
-    memcpy_fast(dst + diff, m_readbuffer + ndiff + read_offset, length);
+    memcpy(dst + diff, m_readbuffer + ndiff + read_offset, length);

     if (m_type == ISOTYPE_CD && diff >= 12)
     {

View File

@@ -161,7 +161,7 @@ int IsoFile::internalRead(void* dest, int off, int len)
         slen = (int) (maxOffset - currentOffset);
     }

-    memcpy_fast((u8*)dest + off, currentSector + sectorOffset, slen);
+    memcpy((u8*)dest + off, currentSector + sectorOffset, slen);

     sectorOffset += slen;
     currentOffset += slen;

View File

@@ -442,27 +442,27 @@ extern __aligned(32) MTGS_BufferedData RingBuffer;
 inline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) {
     uint endpos = destStart + len;
     if ( endpos < destSize ) {
-        memcpy_qwc(&destBase[destStart], src, len );
+        memcpy(&destBase[destStart], src, len*16);
         destStart += len;
     }
     else {
         uint firstcopylen = destSize - destStart;
-        memcpy_qwc(&destBase[destStart], src, firstcopylen );
+        memcpy(&destBase[destStart], src, firstcopylen*16);
         destStart = endpos % destSize;
-        memcpy_qwc(destBase, src+firstcopylen, destStart );
+        memcpy(destBase, src+firstcopylen, destStart*16);
     }
 }

 inline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) {
     uint endpos = srcStart + len;
     if ( endpos < srcSize ) {
-        memcpy_qwc(dest, &srcBase[srcStart], len );
+        memcpy(dest, &srcBase[srcStart], len*16);
         srcStart += len;
     }
     else {
         uint firstcopylen = srcSize - srcStart;
-        memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+        memcpy(dest, &srcBase[srcStart], firstcopylen*16);
         srcStart = endpos % srcSize;
-        memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
+        memcpy(dest+firstcopylen, srcBase, srcStart*16);
     }
 }
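
The two helpers above are where the qwc-to-bytes conversion matters most:
the ring holds u128 elements, so every element count becomes count*16 bytes,
and a copy that would run past the end of the ring is split into a tail
piece plus a wrapped head piece. A self-contained sketch of the same
wrapped-destination logic, using a plain byte buffer with the element size
passed explicitly (the demo values are made up):

// Mirrors the new-code branch structure of MemCopy_WrappedDest above.
#include <cstdio>
#include <cstring>

// Copy `len` elements of `elemSize` bytes into a ring of `ringElems`
// elements, splitting the copy when it would run past the end.
void wrappedDestCopy(const unsigned char* src, unsigned char* ringBase,
                     unsigned& destStart, unsigned ringElems,
                     unsigned len, unsigned elemSize)
{
    unsigned endpos = destStart + len;
    if (endpos < ringElems) {
        std::memcpy(ringBase + destStart * elemSize, src, len * elemSize);
        destStart += len;
    } else {
        unsigned firstcopylen = ringElems - destStart;   // elements up to the end
        std::memcpy(ringBase + destStart * elemSize, src, firstcopylen * elemSize);
        destStart = endpos % ringElems;                  // wrapped remainder
        std::memcpy(ringBase, src + firstcopylen * elemSize, destStart * elemSize);
    }
}

int main()
{
    unsigned char ring[8] = {};
    unsigned char data[5] = {1, 2, 3, 4, 5};
    unsigned pos = 6;                        // only 2 slots left before the wrap
    wrappedDestCopy(data, ring, pos, 8, 5, 1);
    for (unsigned char b : ring) std::printf("%d ", b);  // prints: 3 4 5 0 0 0 1 2
    std::printf("(pos=%u)\n", pos);                      // pos=3
}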

View File

@@ -222,7 +222,7 @@ struct Gif_Path {
         }
         //DevCon.WriteLn("Realign Packet [%d]", curSize - offset);
         if (intersect) memmove(buffer, &buffer[offset], curSize - offset);
-        else           memcpy_fast(buffer, &buffer[offset], curSize - offset);
+        else           memcpy(buffer, &buffer[offset], curSize - offset);
         curSize -= offset;
         curOffset = gsPack.size;
         gsPack.offset = 0;
@@ -241,8 +241,7 @@ struct Gif_Path {
             mtgsReadWait(); // Let MTGS run to free up buffer space
         }
         pxAssertDev(curSize+size<=buffSize, "Gif Path Buffer Overflow!");
-        if (aligned) memcpy_qwc (&buffer[curSize], pMem, size/16);
-        else         memcpy_fast(&buffer[curSize], pMem, size);
+        memcpy(&buffer[curSize], pMem, size);
         curSize += size;
     }

View File

@@ -181,7 +181,7 @@ void SysMtgsThread::OpenPlugin()
 {
     if( m_PluginOpened ) return;

-    memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
+    memcpy( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
     GSsetBaseMem( RingBuffer.Regs );
     GSirqCallback( dummyIrqCallback );
@@ -626,7 +626,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
     if (syncRegs) {
         ScopedLock lock(m_mtx_WaitGS);
         // Completely synchronize GS and MTGS register states.
-        memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
+        memcpy(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
     }
 }

View File

@@ -217,7 +217,7 @@ __fi u32 VU_Thread::Read()
 __fi void VU_Thread::Read(void* dest, u32 size)
 {
-    memcpy_fast(dest, &buffer[read_pos], size);
+    memcpy(dest, &buffer[read_pos], size);
     incReadPos(size_u32(size));
 }
@@ -240,7 +240,7 @@ __fi void VU_Thread::Write(u32 val)
 }

 __fi void VU_Thread::Write(void* src, u32 size)
 {
-    memcpy_fast(GetWritePtr(), src, size);
+    memcpy(GetWritePtr(), src, size);
     write_offset += size_u32(size);
 }

View File

@@ -220,7 +220,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
         // the transfer is most likely wrapped/partial. We need to queue it into a linear buffer
         // and then send it on its way on the next copy.
-        memcpy_qwc( path1queue, src128, data );
+        memcpy( path1queue, src128, data*16);
         path1size = data;
     }
     else
@@ -235,7 +235,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
         if (src128 == RingBuffer.m_Ring)
         {
             pxAssert( (data+path1size) <= 0x400 );
-            memcpy_qwc( &path1queue[path1size], src128, data );
+            memcpy( &path1queue[path1size], src128, data*16);
             path1size += data;
         }
         GSgifTransfer1( (u32*)path1queue, 0 );
@@ -455,7 +455,7 @@ static s32 CALLBACK CDVD_getBuffer2(u8* buffer)
     u8* pb = CDVD->getBuffer();
     if(pb == NULL) return -2;

-    memcpy_fast( buffer, pb, lastReadSize );
+    memcpy( buffer, pb, lastReadSize );
     return 0;
 }

View File

@@ -216,7 +216,7 @@ static int __Deci2Call(int call, u32 *addr)
                 pdeciaddr += (d2ptr[4]+0xc) % 16;

             const int copylen = std::min<uint>(255, d2ptr[1]-0xc);
-            memcpy_fast(deci2buffer, pdeciaddr, copylen );
+            memcpy(deci2buffer, pdeciaddr, copylen );
             deci2buffer[copylen] = '\0';

             eeConLog( ShiftJIS_ConvertString(deci2buffer) );

View File

@@ -97,7 +97,7 @@ int _SPR0chain()
         //Taking an arbitary small value for games which like to check the QWC/MADR instead of STR, so get most of
         //the cycle delay out of the way before the end.
         partialqwc = spr0ch.qwc;
-        memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
+        memcpy(pMem, &psSu128(spr0ch.sadr), partialqwc*16);

         // clear VU mem also!
         TestClearVUs(spr0ch.madr, partialqwc, true);
@@ -151,7 +151,7 @@ void _SPR0interleave()
             case MFD_RESERVED:
                 // clear VU mem also!
                 TestClearVUs(spr0ch.madr, spr0ch.qwc, true);
-                memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
+                memcpy(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc*16);
                 break;
         }
         spr0ch.sadr += spr0ch.qwc * 16;
@@ -322,7 +322,7 @@ __fi static void SPR1transfer(const void* data, int qwc)
         TestClearVUs(spr1ch.madr, spr1ch.qwc, false);
     }

-    memcpy_qwc(&psSu128(spr1ch.sadr), data, qwc);
+    memcpy(&psSu128(spr1ch.sadr), data, qwc*16);
     spr1ch.sadr += qwc * 16;
 }
@@ -381,7 +381,7 @@ void _SPR1interleave()
         spr1ch.qwc = std::min(tqwc, qwc);
         qwc -= spr1ch.qwc;
         pMem = SPRdmaGetAddr(spr1ch.madr, false);
-        memcpy_qwc(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc);
+        memcpy(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc*16);
         spr1ch.sadr += spr1ch.qwc * 16;
         spr1ch.madr += (sqwc + spr1ch.qwc) * 16;
     }

View File

@@ -126,7 +126,7 @@ SaveStateBase& SaveStateBase::FreezeBios()
     pxToUTF8 utf8(BiosDescription);

     memzero( biosdesc );
-    memcpy_fast( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
+    memcpy( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );

     Freeze( bioscheck );
     Freeze( biosdesc );
@@ -282,7 +282,7 @@ void memSavingState::FreezeMem( void* data, int size )
     if (!size) return;

     m_memory->MakeRoomFor( m_idx + size );
-    memcpy_fast( m_memory->GetPtr(m_idx), data, size );
+    memcpy( m_memory->GetPtr(m_idx), data, size );
     m_idx += size;
 }
@@ -322,7 +322,7 @@ void memLoadingState::FreezeMem( void* data, int size )
 {
     const u8* const src = m_memory->GetPtr(m_idx);
     m_idx += size;
-    memcpy_fast( data, src, size );
+    memcpy( data, src, size );
 }

 // --------------------------------------------------------------------------------------

View File

@@ -53,8 +53,8 @@ struct sifFifo
         const int wP0 = std::min((FIFO_SIF_W - writePos), words);
         const int wP1 = words - wP0;

-        memcpy_fast(&data[writePos], from, wP0 << 2);
-        memcpy_fast(&data[0], &from[wP0], wP1 << 2);
+        memcpy(&data[writePos], from, wP0 << 2);
+        memcpy(&data[0], &from[wP0], wP1 << 2);

         writePos = (writePos + words) & (FIFO_SIF_W - 1);
         size += words;
@@ -69,8 +69,8 @@ struct sifFifo
         const int wP0 = std::min((FIFO_SIF_W - readPos), words);
         const int wP1 = words - wP0;

-        memcpy_fast(to, &data[readPos], wP0 << 2);
-        memcpy_fast(&to[wP0], &data[0], wP1 << 2);
+        memcpy(to, &data[readPos], wP0 << 2);
+        memcpy(&to[wP0], &data[0], wP1 << 2);

         readPos = (readPos + words) & (FIFO_SIF_W - 1);
         size -= words;
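
The SIF FIFO applies the same split-copy idea with 32-bit words: wP0 words
fit before the end of the ring, wP1 wrap around to the front, and << 2
turns word counts into the byte counts memcpy expects. A small worked check
of that arithmetic (FIFO_SIF_W = 128 is an assumption here; the
& (FIFO_SIF_W - 1) mask only requires a power of two):

// Worked check of the write-side math shown above, hypothetical values.
#include <algorithm>
#include <cstdio>

int main()
{
    const int FIFO_SIF_W = 128;
    int writePos = 120, words = 20;

    int wP0 = std::min(FIFO_SIF_W - writePos, words); // 8 words before the end
    int wP1 = words - wP0;                            // 12 words wrap to the front

    std::printf("first copy: %d bytes, second copy: %d bytes\n",
                wP0 << 2, wP1 << 2);                  // 32 and 48 bytes
    writePos = (writePos + words) & (FIFO_SIF_W - 1);
    std::printf("new writePos: %d\n", writePos);      // 12
}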

View File

@@ -319,7 +319,7 @@ SIO_WRITE memcardErase(u8 data)
     {
         case 0x82: // Erase
             //siomode = SIO_DUMMY; // Nothing more to do here.
-            memcpy_fast(sio.buf, &header[1], 4);
+            memcpy(sio.buf, &header[1], 4);
             sio.bufSize = 3;
             mcd->EraseBlock();
             break;
@@ -367,7 +367,7 @@ SIO_WRITE memcardWrite(u8 data)
     switch(data)
     {
         case 0x42: // Write
-            memcpy_fast(sio.buf, header, 4);
+            memcpy(sio.buf, header, 4);
             once = true;
             break;
@@ -375,7 +375,7 @@ SIO_WRITE memcardWrite(u8 data)
             if(once)
             {
                 siomode = SIO_DUMMY; // Nothing more to do here.
-                memcpy_fast(sio.buf, &header[1], 4);
+                memcpy(sio.buf, &header[1], 4);
                 sio.bufSize = 3;
                 sio2.packet.recvVal1 = 0x1600; // Writing
@@ -454,7 +454,7 @@ SIO_WRITE memcardRead(u8 data)
     switch(data)
     {
         case 0x43: // Read
-            memcpy_fast(sio.buf, header, 4);
+            memcpy(sio.buf, header, 4);
             once = true;
             break;
@@ -462,7 +462,7 @@ SIO_WRITE memcardRead(u8 data)
             if(once)
             {
                 siomode = SIO_DUMMY; // Nothing more to do here.
-                memcpy_fast(sio.buf, &header[1], 4);
+                memcpy(sio.buf, &header[1], 4);
                 sio.bufSize = 3;
                 sio2.packet.recvVal1 = 0x1700; // Reading
@@ -624,7 +624,7 @@ SIO_WRITE sioWriteMemcard(u8 data)
                 cmd.mc_xor = info.Xor;
                 cmd.Z = mcd->term;

-                memcpy_fast(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
+                memcpy(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
             }
             break;
@@ -698,7 +698,7 @@ SIO_WRITE sioWriteMemcardPSX(u8 data)
     {
         case 0x53: // PSX 'S'tate // haven't seen it happen yet
             sio.buf[1] = mcd->FLAG;
-            memcpy_fast(&sio.buf[2], memcard_psx, 8);
+            memcpy(&sio.buf[2], memcard_psx, 8);
             siomode = SIO_DUMMY;
             break;

View File

@@ -296,9 +296,9 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
         if (!idx) CpuVU0->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
         else      CpuVU1->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
-        memcpy_fast(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
+        memcpy(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
         size -= ((idx ? 0x4000 : 0x1000) - addr) / 4;
-        memcpy_fast(VUx.Micro, data, size);
+        memcpy(VUx.Micro, data, size);
         vifX.tag.addr = size * 4;
     }
@@ -310,7 +310,7 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
         // Clear VU memory before writing!
         if (!idx) CpuVU0->Clear(addr, size*4);
         else      CpuVU1->Clear(addr, size*4);
-        memcpy_fast(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
+        memcpy(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy
         vifX.tag.addr += size * 4;
     }

View File

@@ -584,7 +584,7 @@ bool ConsoleLogFrame::Write( ConsoleColors color, const wxString& text )
     int endpos = m_CurQueuePos + text.Length();
     m_QueueBuffer.MakeRoomFor( endpos + 1 ); // and the null!!
-    memcpy_fast( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
+    memcpy( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
     m_CurQueuePos = endpos;

     // this NULL may be overwritten if the next message sent doesn't perform a color change.

View File

@@ -126,17 +126,17 @@ namespace VU1micro
 #endif
         runCount++;

-        memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
-        memcpy_const((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);
+        memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
+        memcpy((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);

         do { // while loop needed since not always will return finished
             SuperVUExecuteProgram(VU1.VI[ REG_TPC ].UL & 0x3fff, 1);
         } while( VU0.VI[ REG_VPU_STAT ].UL&0x100 );

-        memcpy_const((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
-        memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
-        memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-        memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+        memcpy((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
+        memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
+        memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+        memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

         //Currently breaking mVU execution is disabled. Check mVUtestCycles<vuIndex>() in microVU_Compile.inl
         runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);
@@ -227,8 +227,8 @@ namespace VU1micro
         if (mVUdebugNow) {
             resetVUrec(1);
-            memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-            memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+            memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+            memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
             runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);

View File

@@ -1016,8 +1016,8 @@ void SaveBranchState()
     s_psaveInstInfo = g_pCurInstInfo;

     // save all mmx regs
-    memcpy_const(s_saveMMXregs, mmxregs, sizeof(mmxregs));
-    memcpy_const(s_saveXMMregs, xmmregs, sizeof(xmmregs));
+    memcpy(s_saveMMXregs, mmxregs, sizeof(mmxregs));
+    memcpy(s_saveXMMregs, xmmregs, sizeof(xmmregs));
 }

 void LoadBranchState()
@@ -1031,8 +1031,8 @@ void LoadBranchState()
     g_pCurInstInfo = s_psaveInstInfo;

     // restore all mmx regs
-    memcpy_const(mmxregs, s_saveMMXregs, sizeof(mmxregs));
-    memcpy_const(xmmregs, s_saveXMMregs, sizeof(xmmregs));
+    memcpy(mmxregs, s_saveMMXregs, sizeof(mmxregs));
+    memcpy(xmmregs, s_saveXMMregs, sizeof(xmmregs));
 }

 void iFlushCall(int flushtype)
@@ -2179,7 +2179,7 @@ StartRecomp:
             }
         }

-        memcpy_fast(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
+        memcpy(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
     }

     s_pCurBlock->SetFnptr((uptr)recPtr);

View File

@@ -181,8 +181,8 @@ __ri microProgram* mVUcreateProg(microVU& mVU, int startPC) {
 // Caches Micro Program
 __ri void mVUcacheProg(microVU& mVU, microProgram& prog) {
-    if (!mVU.index) memcpy_const(prog.data, mVU.regs().Micro, 0x1000);
-    else            memcpy_const(prog.data, mVU.regs().Micro, 0x4000);
+    if (!mVU.index) memcpy(prog.data, mVU.regs().Micro, 0x1000);
+    else            memcpy(prog.data, mVU.regs().Micro, 0x4000);
     mVUdumpProg(mVU, prog);
 }

View File

@@ -92,7 +92,7 @@ public:
             blockEnd = blockList = newBlock;
         }
-        memcpy_const(&newBlock->block, pBlock, sizeof(microBlock));
+        memcpy(&newBlock->block, pBlock, sizeof(microBlock));
         thisBlock = &newBlock->block;
     }
     return thisBlock;

View File

@@ -170,7 +170,7 @@ void normBranchCompile(microVU& mVU, u32 branchPC) {
 }

 void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
-    memcpy_const(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+    memcpy(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
     mVUsetupBranch(mVU, mFC);
     mVUbackupRegs(mVU);
@@ -386,7 +386,7 @@ void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
     s32* ajmp = xJcc32((JccComparisonType)JMPcc);
     u32 bPC = iPC; // mVUcompile can modify iPC, mVUpBlock, and mVUregs so back them up
     microBlock* pBlock = mVUpBlock;
-    memcpy_const(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+    memcpy(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
     incPC2(1); // Get PC for branch not-taken
     mVUcompile(mVU, xPC, (uptr)&mVUregs);

View File

@@ -427,10 +427,10 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr) {
     mVU.p = 0; // All blocks start at p index #0
     mVU.q = 0; // All blocks start at q index #0
     if ((uptr)&mVUregs != pState) { // Loads up Pipeline State Info
-        memcpy_const((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
+        memcpy((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
     }
     if (doEarlyExit(mVU) && ((uptr)&mVU.prog.lpState != pState)) {
-        memcpy_const((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
+        memcpy((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
     }
     mVUblock.x86ptrStart = thisPtr;
     mVUpBlock = mVUblocks[mVUstartPC/2]->add(&mVUblock); // Add this block to block manager
@@ -530,7 +530,7 @@ void mVUDoTBit(microVU& mVU, microFlagCycles* mFC)
 void mVUSaveFlags(microVU& mVU,microFlagCycles &mFC, microFlagCycles &mFCBackup)
 {
-    memcpy_fast(&mFCBackup, &mFC, sizeof(microFlagCycles));
+    memcpy(&mFCBackup, &mFC, sizeof(microFlagCycles));
     mVUsetFlags(mVU, mFCBackup); // Sets Up Flag instances
 }

 void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {

View File

@@ -86,7 +86,7 @@ public:
                 wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
             );
         }
-        memcpy_const(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
+        memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
     }
     void clear() {
         for (int i = 0; i < hSize; i++) {

View File

@@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) {
     if (ret == vif.tag.size) { // Full Transfer
         if (v.bSize) { // Last transfer was partial
-            memcpy_aligned(&v.buffer[v.bSize], data, size);
+            memcpy(&v.buffer[v.bSize], data, size);
             v.bSize += size;
             size = v.bSize;
             data = v.buffer;
@@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) {
         v.bSize = 0;
     }
     else { // Partial Transfer
-        memcpy_aligned(&v.buffer[v.bSize], data, size);
+        memcpy(&v.buffer[v.bSize], data, size);
         v.bSize += size;
         vif.tag.size -= ret;

View File

@@ -131,15 +131,15 @@ void recSuperVU1::Execute(u32 cycles) {
 #endif
     runCount++;

-    memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
-    memcpy_const((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);
+    memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
+    memcpy((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);

     runMVU1(cycles);

-    memcpy_const((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
-    memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
-    memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-    memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+    memcpy((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
+    memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
+    memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+    memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

     runSVU1(cycles);
     if ((memcmp((u8*)cmpVUregs, (u8*)&VU1, (16*32) + (16*16))) || (memcmp((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000))) {
@@ -230,8 +230,8 @@ void recSuperVU1::Execute(u32 cycles) {
     resetMVU1();

-    memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-    memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+    memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+    memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

     runMVU1(cycles);

View File

@@ -898,7 +898,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
             //memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
             r.pmem = malloc(r.size);
-            memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
+            memcpy(r.pmem, &VU->Micro[r.start], r.size);
 #endif
             s_pFnHeader->ranges.push_back(r);
         }

View File

@@ -647,7 +647,6 @@ char *SysLibError(); // Gets previous error loading sysbols
 void SysCloseLibrary(void *lib); // Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>

View File

@@ -148,7 +148,6 @@
     <ClCompile Include="Conf.cpp" />
     <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />
@@ -199,4 +198,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>

View File

@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
-</Project>
+</Project>

View File

@@ -154,7 +154,6 @@
     <ClCompile Include="Conf.cpp" />
     <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />
@@ -205,4 +204,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>

View File

@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
      <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
-</Project>
+</Project>

View File

@@ -154,7 +154,6 @@
     <ClCompile Include="Conf.cpp" />
     <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />
@@ -205,4 +204,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>

View File

@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
-</Project>
+</Project>

View File

@@ -1,479 +0,0 @@
/******************************************************************************
Copyright (c) 2001 Advanced Micro Devices, Inc.
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
NOT APPLY TO YOU.
AMD does not assume any responsibility for any errors which may appear in the
Materials nor any responsibility to support or update the Materials. AMD retains
the right to make changes to its test specifications at any time, without notice.
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
further information, software, technical information, know-how, or show-how
available to you.
So that all may benefit from your experience, please report any problems
or suggestions about this software to 3dsdk.support@amd.com
AMD Developer Technologies, M/S 585
Advanced Micro Devices, Inc.
5900 E. Ben White Blvd.
Austin, TX 78741
3dsdk.support@amd.com
******************************************************************************/
#include <assert.h>
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
//#include <stddef.h>
// Inline assembly syntax for use with Visual C++
#ifdef _WIN32
#include <windows.h>
#endif
#include "PS2Etypes.h"
extern "C" {
#if defined(_MSC_VER) && !defined(__x86_64__)
void * memcpy_amd(void *dest, const void *src, size_t n)
{
__asm {
mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dest] ; ret value = destination pointer
}
}
// mmx memcpy implementation, size has to be a multiple of 8
// returns 0 is equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
{
assert( (cmpsize&7) == 0 );
__asm {
push esi
mov ecx, cmpsize
mov edx, src1
mov esi, src2
cmp ecx, 32
jl Done4
// custom test first 8 to make sure things are ok
movq mm0, [esi]
movq mm1, [esi+8]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pand mm0, mm1
movq mm2, [esi+16]
pmovmskb eax, mm0
movq mm3, [esi+24]
// check if eq
cmp eax, 0xff
je NextComp
mov eax, 1
jmp End
NextComp:
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm2, mm3
pmovmskb eax, mm2
sub ecx, 32
add esi, 32
add edx, 32
// check if eq
cmp eax, 0xff
je ContinueTest
mov eax, 1
jmp End
cmp ecx, 64
jl Done8
Cmp8:
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm0, mm1
pcmpeqd mm4, [edx+32]
pand mm0, mm2
pcmpeqd mm5, [edx+40]
pand mm0, mm3
pcmpeqd mm6, [edx+48]
pand mm0, mm4
pcmpeqd mm7, [edx+56]
pand mm0, mm5
pand mm0, mm6
pand mm0, mm7
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
je Continue
mov eax, 1
jmp End
Continue:
sub ecx, 64
add esi, 64
add edx, 64
ContinueTest:
cmp ecx, 64
jge Cmp8
Done8:
test ecx, 0x20
jz Done4
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm0, mm1
pand mm0, mm2
pand mm0, mm3
pmovmskb eax, mm0
sub ecx, 32
add esi, 32
add edx, 32
// check if eq
cmp eax, 0xff
je Done4
mov eax, 1
jmp End
Done4:
cmp ecx, 24
jne Done2
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pand mm0, mm1
pand mm0, mm2
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
setne al
jmp End
Done2:
cmp ecx, 16
jne Done1
movq mm0, [esi]
movq mm1, [esi+8]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pand mm0, mm1
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
setne al
jmp End
Done1:
cmp ecx, 8
jne Done
mov eax, [esi]
mov esi, [esi+4]
cmp eax, [edx]
je Next
mov eax, 1
jmp End
Next:
cmp esi, [edx+4]
setne al
jmp End
Done:
xor eax, eax
End:
pop esi
emms
}
}
#else // _MSC_VER
// assume gcc or mingw or win x64
#include <memory.h>
#include <string.h>
void * memcpy_amd(void *dest, const void *src, size_t n)
{
memcpy(dest, src, n);
return dest;
}
#endif
}
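
The file deleted above picked one of four strategies by transfer size, per
its own comments: unrolled movsd below 64 bytes, MMX block copies with
software prefetch up to 2 KB, non-temporal movntq stores up to 4 KB, and
block prefetch beyond that. A sketch of that dispatch shape, reusing the
thresholds from the file's own #defines with the tier bodies stubbed out;
modern compilers and libc memcpy implementations perform this kind of
size-based dispatch internally, which is what makes the hand-rolled version
redundant:

// Sketch only: each tier falls through to std::memcpy as a stand-in.
#include <cstddef>
#include <cstring>

void* memcpy_tiered(void* dest, const void* src, size_t n)
{
    const size_t TINY_BLOCK_COPY = 64;        // below: unrolled movsd-style copy
    const size_t IN_CACHE_COPY   = 2 * 1024;  // below: 64-byte MMX blocks, SW prefetch
    const size_t UNCACHED_COPY   = 4 * 1024;  // below: movntq streaming stores

    if (n < TINY_BLOCK_COPY) {
        // tiny: simple unrolled loop
    } else if (n < IN_CACHE_COPY) {
        // medium: 64-byte blocks, writes go through the cache
    } else if (n < UNCACHED_COPY) {
        // large: non-temporal stores bypass the cache
    } else {
        // huge: block-prefetch a run of cache lines, then stream out
    }
    return std::memcpy(dest, src, n);
}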

View File

@@ -2026,7 +2026,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
         targ->clut.resize(clutsize);

         if( tex0.cpsm <= 1 ) { // 32 bit
-            memcpy_amd(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+            memcpy(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
         }
         else {
             u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
@@ -2110,7 +2110,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
             targ->memory->ref = 1;
         }

-        memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+        memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

         u8* psrc = (u8*)(ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);
@@ -2136,7 +2136,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
             targ->memory->ref = 1;
         }

-        memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+        memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

         // needs to be 8 bit, use xmm for unpacking
         u16* dst = (u16*)lock.pBits;
@@ -2219,7 +2219,7 @@ Z16Loop:
             targ->memory = NULL;
         }

-        memcpy_amd(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
+        memcpy(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
     }
 }

View File

@@ -2239,7 +2239,7 @@ void ZeroGS::Flush(int context)
         }

         if( curvb.tex0.cpsm <= 1 ) { // 32 bit
-            memcpy_amd(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+            memcpy(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
         }
         else {
             u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
@@ -5087,7 +5087,7 @@ void ZeroGS::CaptureFrame()
     BYTE* pend = (BYTE*)lock.pBits + (conf.height-1)*width*4;

     for(int i = 0; i < conf.height; ++i) {
-        memcpy_amd(&mem[width*4*i], pend - width*4*i, width * 4);
+        memcpy(&mem[width*4*i], pend - width*4*i, width * 4);
     }

     s_ptexAVICapture->UnlockRect();

View File

@@ -36,7 +36,6 @@ set(zerogsSources
     GSmain.cpp
     GLWinX11.cpp
     Mem.cpp
-    memcpy_amd.cpp
     rasterfont.cpp
     Regs.cpp
     targets.cpp

View File

@@ -728,7 +728,6 @@ char *SysLibError(); // Gets previous error loading sysbols
 void SysCloseLibrary(void *lib); // Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>

View File

@@ -23,7 +23,7 @@ libZeroGSogl_LDFLAGS+=-Wl,-soname,@ZEROGS_SONAME@
 libZeroGSogl_LDADD=$(libZeroGSogl_a_OBJECTS)

 libZeroGSogl_a_SOURCES = \
-GSmain.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp \
+GSmain.cpp Regs.cpp x86.cpp zpipe.cpp \
 Mem.cpp rasterfont.cpp targets.cpp zerogs.cpp GifTransfer.cpp GLWinX11.cpp

 libZeroGSogl_a_SOURCES += x86-32.S

View File

@@ -1,478 +0,0 @@
/******************************************************************************
Copyright (c) 2001 Advanced Micro Devices, Inc.
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
NOT APPLY TO YOU.
AMD does not assume any responsibility for any errors which may appear in the
Materials nor any responsibility to support or update the Materials. AMD retains
the right to make changes to its test specifications at any time, without notice.
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
further information, software, technical information, know-how, or show-how
available to you.
So that all may benefit from your experience, please report any problems
or suggestions about this software to 3dsdk.support@amd.com
AMD Developer Technologies, M/S 585
Advanced Micro Devices, Inc.
5900 E. Ben White Blvd.
Austin, TX 78741
3dsdk.support@amd.com
******************************************************************************/
#include <assert.h>
/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/
// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
//#include <stddef.h>
// Inline assembly syntax for use with Visual C++
#ifdef _WIN32
#include <windows.h>
#endif
extern "C" {
#include "PS2Etypes.h"
#if defined(_MSC_VER)
void * memcpy_amd(void *dest, const void *src, size_t n)
{
__asm {
mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order to suppress HW prefetcher
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dest] ; ret value = destination pointer
}
}
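For readers more comfortable with intrinsics than MASM, the uncached path above ($memcpy_uc_1) pairs prefetchnta reads with movntq streaming stores. A hedged SSE2 sketch of the same technique — assuming a 16-byte-aligned destination and a length that is a multiple of 64; the function name is illustrative, not part of this file:

#include <emmintrin.h>  // SSE2: _mm_loadu_si128, _mm_stream_si128
#include <xmmintrin.h>  // _mm_prefetch, _mm_sfence
#include <stddef.h>

// Streaming copy: prefetch reads with the NTA hint, write with non-temporal
// stores that bypass the cache (the intrinsic analogue of movntq).
// dst must be 16-byte aligned; n must be a multiple of 64.
void stream_copy64(void* dst, const void* src, size_t n)
{
    __m128i*       d = (__m128i*)dst;
    const __m128i* s = (const __m128i*)src;
    for (size_t i = 0; i < n / 64; ++i, d += 4, s += 4) {
        _mm_prefetch((const char*)s + 512, _MM_HINT_NTA); // read ahead, as the asm does
        _mm_stream_si128(d + 0, _mm_loadu_si128(s + 0));
        _mm_stream_si128(d + 1, _mm_loadu_si128(s + 1));
        _mm_stream_si128(d + 2, _mm_loadu_si128(s + 2));
        _mm_stream_si128(d + 3, _mm_loadu_si128(s + 3));
    }
    _mm_sfence(); // drain the write-combining buffers, as the asm's sfence does
}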
// mmx memcmp implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
{
assert( (cmpsize&7) == 0 );
__asm {
push esi
mov ecx, cmpsize
mov edx, src1
mov esi, src2
cmp ecx, 32
jl Done4
// test the first 32 bytes up front to make sure things are ok
movq mm0, [esi]
movq mm1, [esi+8]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pand mm0, mm1
movq mm2, [esi+16]
pmovmskb eax, mm0
movq mm3, [esi+24]
// check if eq
cmp eax, 0xff
je NextComp
mov eax, 1
jmp End
NextComp:
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm2, mm3
pmovmskb eax, mm2
sub ecx, 32
add esi, 32
add edx, 32
// check if eq
cmp eax, 0xff
je ContinueTest
mov eax, 1
jmp End
// note: the two lines below are unreachable (both paths above jump away);
// the equivalent 64-byte check is done at ContinueTest
cmp ecx, 64
jl Done8
Cmp8:
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm0, mm1
pcmpeqd mm4, [edx+32]
pand mm0, mm2
pcmpeqd mm5, [edx+40]
pand mm0, mm3
pcmpeqd mm6, [edx+48]
pand mm0, mm4
pcmpeqd mm7, [edx+56]
pand mm0, mm5
pand mm0, mm6
pand mm0, mm7
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
je Continue
mov eax, 1
jmp End
Continue:
sub ecx, 64
add esi, 64
add edx, 64
ContinueTest:
cmp ecx, 64
jge Cmp8
Done8:
test ecx, 0x20
jz Done4
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
movq mm3, [esi+24]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pcmpeqd mm3, [edx+24]
pand mm0, mm1
pand mm0, mm2
pand mm0, mm3
pmovmskb eax, mm0
sub ecx, 32
add esi, 32
add edx, 32
// check if eq
cmp eax, 0xff
je Done4
mov eax, 1
jmp End
Done4:
cmp ecx, 24
jne Done2
movq mm0, [esi]
movq mm1, [esi+8]
movq mm2, [esi+16]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pcmpeqd mm2, [edx+16]
pand mm0, mm1
pand mm0, mm2
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
setne al
jmp End
Done2:
cmp ecx, 16
jne Done1
movq mm0, [esi]
movq mm1, [esi+8]
pcmpeqd mm0, [edx]
pcmpeqd mm1, [edx+8]
pand mm0, mm1
pmovmskb eax, mm0
// check if eq
cmp eax, 0xff
setne al
jmp End
Done1:
cmp ecx, 8
jne Done
mov eax, [esi]
mov esi, [esi+4]
cmp eax, [edx]
je Next
mov eax, 1
jmp End
Next:
cmp esi, [edx+4]
setne al
jmp End
Done:
xor eax, eax
End:
pop esi
emms
}
}
#else // _MSC_VER
// assume gcc
#include <memory.h>
#include <string.h>
void * memcpy_amd(void *dest, const void *src, size_t n)
{
memcpy(dest, src, n);
return dest;
}
#endif
}
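The comparison loop relies on pcmpeqd to compare 32-bit lanes and pmovmskb to fold the result into a byte mask (0xff means all eight bytes matched). The same idea expressed with SSE2 intrinsics — a sketch under the assumption that the size is a multiple of 16; the function name is illustrative only:

#include <emmintrin.h>
#include <stddef.h>

// Returns 0 if the buffers are equal, nonzero otherwise. n must be a multiple of 16.
int memcmp_sse2(const void* a, const void* b, size_t n)
{
    const __m128i* pa = (const __m128i*)a;
    const __m128i* pb = (const __m128i*)b;
    for (size_t i = 0; i < n / 16; ++i) {
        __m128i eq = _mm_cmpeq_epi32(_mm_loadu_si128(pa + i),
                                     _mm_loadu_si128(pb + i));
        // Analogue of pmovmskb: one bit per byte, 0xFFFF when all 16 bytes match.
        if (_mm_movemask_epi8(eq) != 0xFFFF)
            return 1;
    }
    return 0;
}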

View File

@ -1789,7 +1789,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
targ->clut.resize(clutsize); targ->clut.resize(clutsize);
if( tex0.cpsm <= 1 ) { // 32 bit if( tex0.cpsm <= 1 ) { // 32 bit
memcpy_amd(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize); memcpy(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
} }
else { else {
u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset); u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
@ -1854,7 +1854,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
assert(targ->ptex->ref > 0 ); assert(targ->ptex->ref > 0 );
} }
memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height); memcpy(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
vector<u8> texdata; vector<u8> texdata;
u8* ptexdata = NULL; u8* ptexdata = NULL;

View File

@ -2568,7 +2568,7 @@ void ZeroGS::Flush(int context)
g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size(); g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size();
glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW); glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU)); // memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
// glUnmapBuffer(GL_ARRAY_BUFFER); // glUnmapBuffer(GL_ARRAY_BUFFER);
SET_STREAM(); SET_STREAM();
@ -2652,7 +2652,7 @@ void ZeroGS::Flush(int context)
} }
if( curvb.tex0.cpsm <= 1 ) { // 32 bit if( curvb.tex0.cpsm <= 1 ) { // 32 bit
memcpy_amd(&data[0], g_pbyGSClut+nClutOffset, clutsize); memcpy(&data[0], g_pbyGSClut+nClutOffset, clutsize);
} }
else { else {
u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset); u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
@ -5779,7 +5779,7 @@ void ZeroGS::CaptureFrame()
// u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4; // u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4;
// for(int i = 0; i < conf.height; ++i) { // for(int i = 0; i < conf.height; ++i) {
// memcpy_amd(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4); // memcpy(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
// } // }
int fps = SMODE1->CMOD == 3 ? 50 : 60; int fps = SMODE1->CMOD == 3 ? 50 : 60;

View File

@ -436,7 +436,7 @@ namespace ZeroGS {
if( nCount + nVerts > nNumVertices ) { if( nCount + nVerts > nNumVertices ) {
// recreate except with a bigger count // recreate except with a bigger count
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256); VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256);
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount); memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
nNumVertices *= 2; nNumVertices *= 2;
assert( nCount + nVerts <= nNumVertices ); assert( nCount + nVerts <= nNumVertices );
_aligned_free(pBufferData); _aligned_free(pBufferData);

View File

@ -55,7 +55,6 @@ set(zzoglSources
GSmain.cpp GSmain.cpp
HostMemory.cpp HostMemory.cpp
Mem.cpp Mem.cpp
# memcpy_amd.cpp
Mem_Swizzle.cpp Mem_Swizzle.cpp
Mem_Tables.cpp Mem_Tables.cpp
Profile.cpp Profile.cpp

View File

@ -68,7 +68,6 @@ extern "C" char* CALLBACK PS2EgetLibName(void);
#include "GSDump.h" #include "GSDump.h"
#include "Utilities/MemcpyFast.h" #include "Utilities/MemcpyFast.h"
#define memcpy_amd memcpy_fast
extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

View File

@ -493,7 +493,7 @@ template <>
/*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize) /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
{ {
u8* clut = (u8*)GetClutBufferAddress<u32>(csa); u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
memcpy_amd((u8*)dst, clut, clutsize); memcpy((u8*)dst, clut, clutsize);
} }
template <> template <>

View File

@ -657,7 +657,7 @@ inline void FlushSetStream(VB& curvb)
g_nCurVBOIndex = (g_nCurVBOIndex + 1) % g_vboBuffers.size(); g_nCurVBOIndex = (g_nCurVBOIndex + 1) % g_vboBuffers.size();
glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW); glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU)); // memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
// glUnmapBuffer(GL_ARRAY_BUFFER); // glUnmapBuffer(GL_ARRAY_BUFFER);
SET_STREAM(); SET_STREAM();

View File

@ -89,7 +89,7 @@ class VB
assert(pBufferData != NULL); assert(pBufferData != NULL);
nNumVertices *= 2; nNumVertices *= 2;
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256); VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount); memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
assert(nCount <= nNumVertices); assert(nCount <= nNumVertices);
_aligned_free(pBufferData); _aligned_free(pBufferData);
pBufferData = ptemp; pBufferData = ptemp;

View File

@ -1979,7 +1979,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
assert(targ->ptex->ref > 0); assert(targ->ptex->ref > 0);
} }
memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height)); memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
__aligned16 u8* ptexdata = NULL; __aligned16 u8* ptexdata = NULL;
bool has_data = false; bool has_data = false;

View File

@ -86,7 +86,6 @@ typedef signed long long int64;
#include "GSDump.h" #include "GSDump.h"
#include "Utilities/MemcpyFast.h" #include "Utilities/MemcpyFast.h"
#define memcpy_amd memcpy_fast
extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

View File

@ -489,7 +489,7 @@ template <>
/*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize) /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
{ {
u8* clut = (u8*)GetClutBufferAddress<u32>(csa); u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
memcpy_amd((u8*)dst, clut, clutsize); memcpy((u8*)dst, clut, clutsize);
} }
template <> template <>

View File

@ -364,7 +364,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
assert(targ->ptex->ref > 0); assert(targ->ptex->ref > 0);
} }
memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height)); memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
__aligned16 u8* ptexdata = NULL; __aligned16 u8* ptexdata = NULL;
bool has_data = false; bool has_data = false;

View File

@ -535,7 +535,7 @@ inline void FlushSetStream(VB& curvb)
// void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY); // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU)); // memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
// glUnmapBuffer(GL_ARRAY_BUFFER); // glUnmapBuffer(GL_ARRAY_BUFFER);
SET_STREAM(); SET_STREAM();

View File

@ -89,7 +89,7 @@ class VB
assert(pBufferData != NULL); assert(pBufferData != NULL);
nNumVertices *= 2; nNumVertices *= 2;
VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256); VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount); memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
assert(nCount <= nNumVertices); assert(nCount <= nNumVertices);
_aligned_free(pBufferData); _aligned_free(pBufferData);
pBufferData = ptemp; pBufferData = ptemp;