mirror of https://github.com/PCSX2/pcsx2.git

Merge pull request #215 from xsacha/memcpy

Remove some slow, redundant memcpy implementations: memcpy_const/memcpy_...

commit 98d22f8b2e
@@ -118,7 +118,7 @@ template< typename T >
 SafeArray<T>* SafeArray<T>::Clone() const
 {
 	SafeArray<T>* retval = new SafeArray<T>( m_size );
-	memcpy_fast( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
+	memcpy( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
 	return retval;
 }

@@ -160,7 +160,7 @@ template< typename T, uint Alignment >
 SafeAlignedArray<T,Alignment>* SafeAlignedArray<T,Alignment>::Clone() const
 {
 	SafeAlignedArray<T,Alignment>* retval = new SafeAlignedArray<T,Alignment>( this->m_size );
-	memcpy_fast( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
+	memcpy( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
 	return retval;
 }

@@ -272,14 +272,14 @@ void SafeList<T>::Remove( int index )

 	int copylen = m_length - index;
 	if( copylen > 0 )
-		memcpy_fast( &m_ptr[index], &m_ptr[index+1], copylen );
+		memcpy( &m_ptr[index], &m_ptr[index+1], copylen );
 }

 template< typename T >
 SafeList<T>* SafeList<T>::Clone() const
 {
 	SafeList<T>* retval = new SafeList<T>( m_length );
-	memcpy_fast( retval->m_ptr, m_ptr, sizeof(T) * m_length );
+	memcpy( retval->m_ptr, m_ptr, sizeof(T) * m_length );
 	return retval;
 }
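Context for the "slow, redundant" claim in the commit message: these helpers date from an era when CRT memcpy was naive, while modern runtimes already dispatch to tuned SSE paths, so the custom wrappers mostly added overhead. A minimal timing harness for checking such a claim yourself; the buffer size, iteration count, and the byte-wise `custom_copy` stand-in are illustrative assumptions, not code from this repository:

```cpp
#include <chrono>
#include <cstdio>
#include <cstring>
#include <vector>

// Byte-wise stand-in for a removed helper like memcpy_fast, so the
// harness runs anywhere without the old MMX/SSE implementations.
static void custom_copy(void* d, const void* s, size_t n)
{
    unsigned char*       dst = static_cast<unsigned char*>(d);
    const unsigned char* src = static_cast<const unsigned char*>(s);
    while (n--) *dst++ = *src++;
}

template <typename Fn>
static double seconds(Fn copy, void* d, const void* s, size_t n, int iters)
{
    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i) copy(d, s, n);
    std::chrono::duration<double> dt = std::chrono::steady_clock::now() - t0;
    return dt.count();
}

int main()
{
    const size_t n = 1 << 20; // 1 MiB per copy; arbitrary test size
    std::vector<unsigned char> src(n, 0xAB), dst(n);

    double t_std = seconds([](void* d, const void* s, size_t k) { std::memcpy(d, s, k); },
                           dst.data(), src.data(), n, 200);
    double t_cus = seconds(custom_copy, dst.data(), src.data(), n, 200);

    // Print a byte of dst so the copies can't be optimized away entirely.
    std::printf("memcpy: %.3fs  custom: %.3fs  (check %u)\n", t_std, t_cus, unsigned(dst[0]));
}
```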
@@ -265,7 +265,7 @@ FastFormatUnicode& FastFormatUnicode::WriteV( const char* fmt, va_list argptr )
 	const uint inspos = m_Length;
 	const uint convLen = converted.Length();
 	m_dest->MakeRoomFor((inspos + convLen + 64) * sizeof(wxChar));
-	memcpy_fast( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
+	memcpy( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
 	m_Length += convLen;

 	return *this;
@@ -60,7 +60,7 @@ void xSmartJump::SetTarget()
 		u8* destpos = xGetPtr();
 		const int copylen = (sptr)target - (sptr)saveme;

-		memcpy_fast( destpos, saveme, copylen );
+		memcpy( destpos, saveme, copylen );
 		xSetPtr( target - spacer );
 	}
 }
@@ -725,7 +725,7 @@ int cdvdReadSector() {
 		mdest[11] = 0;

 		// normal 2048 bytes of sector data
-		memcpy_const(&mdest[12], cdr.Transfer, 2048);
+		memcpy(&mdest[12], cdr.Transfer, 2048);

 		// 4 bytes of edc (not calculated at present)
 		mdest[2060] = 0;

@@ -735,7 +735,7 @@ int cdvdReadSector() {
 	}
 	else
 	{
-		memcpy_fast( mdest, cdr.Transfer, cdvd.BlockSize);
+		memcpy( mdest, cdr.Transfer, cdvd.BlockSize);
 	}

 	// decrypt sector's bytes
@@ -1567,7 +1567,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
 			cdvd.Param[cdvd.ParamP-5], cdvd.Param[cdvd.ParamP-3], cdvd.Param[cdvd.ParamP-2], cdvd.Param[cdvd.ParamP-1]);
 			Console.WriteLn("RTC Write Sec %d Min %d Hr %d Day %d Month %d Year %d", cdvd.RTC.second, cdvd.RTC.minute,
 			cdvd.RTC.hour, cdvd.RTC.day, cdvd.RTC.month, cdvd.RTC.year);*/
-			//memcpy_fast((u8*)&cdvd.RTC, cdvd.Param, 7);
+			//memcpy((u8*)&cdvd.RTC, cdvd.Param, 7);
 			break;

 		case 0x0A: // sceCdReadNVM (2:3)

@@ -1907,7 +1907,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
 			}
 			else
 			{
-				memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
+				memcpy(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
 				cdvd.mg_size += cdvd.ParamC;
 				cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
 			}

@@ -1915,9 +1915,9 @@ static void cdvdWrite16(u8 rt) // SCOMMAND

 		case 0x8E: // sceMgReadData
 			SetResultSize( std::min(16, cdvd.mg_size) );
-			memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
+			memcpy(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
 			cdvd.mg_size -= cdvd.ResultC;
-			memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
+			memcpy(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
 			break;

 		case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)

@@ -1984,7 +1984,7 @@ static void cdvdWrite16(u8 rt) // SCOMMAND
 		{
 			SetResultSize(3);//in:0
 			int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-			memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
+			memcpy(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);

 			cdvd.mg_maxsize = 0; // don't allow any write
 			cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data
@@ -422,7 +422,7 @@ s32 CALLBACK ISOreadSector(u8* tempbuffer, u32 lsn, int mode)
 			jNO_DEFAULT
 	}

-	memcpy_fast(tempbuffer, pbuffer, psize);
+	memcpy(tempbuffer, pbuffer, psize);

 	return 0;
 }
@@ -917,7 +917,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
 	}

 	cdsize = (bcr & 0xffff) * 4;
-	memcpy_fast(iopPhysMem(madr), cdr.pTransfer, cdsize);
+	memcpy(iopPhysMem(madr), cdr.pTransfer, cdsize);
 	psxCpu->Clear(madr, cdsize/4);
 	cdr.pTransfer+=cdsize;

@@ -947,7 +947,7 @@ s32 CALLBACK cdvdDmaRead(s32 channel, u32* data, u32 bytesLeft, u32* bytesProces
 		return 10000;
 	}

-	memcpy_fast(data, cdr.pTransfer, wordsLeft);
+	memcpy(data, cdr.pTransfer, wordsLeft);
 	//psxCpu->Clear(madr, cdsize/4);
 	cdr.pTransfer+=wordsLeft;
 	*wordsProcessed = wordsLeft;
@@ -145,7 +145,7 @@ int InputIsoFile::FinishRead3(u8* dst, uint mode)
 		length = end - _offset;

 	uint read_offset = (m_current_lsn - m_read_lsn) * m_blocksize;
-	memcpy_fast(dst + diff, m_readbuffer + ndiff + read_offset, length);
+	memcpy(dst + diff, m_readbuffer + ndiff + read_offset, length);

 	if (m_type == ISOTYPE_CD && diff >= 12)
 	{

@@ -161,7 +161,7 @@ int IsoFile::internalRead(void* dest, int off, int len)
 		slen = (int) (maxOffset - currentOffset);
 	}

-	memcpy_fast((u8*)dest + off, currentSector + sectorOffset, slen);
+	memcpy((u8*)dest + off, currentSector + sectorOffset, slen);

 	sectorOffset += slen;
 	currentOffset += slen;
pcsx2/GS.h (12 changed lines)
@@ -442,27 +442,27 @@ extern __aligned(32) MTGS_BufferedData RingBuffer;
 inline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) {
 	uint endpos = destStart + len;
 	if ( endpos < destSize ) {
-		memcpy_qwc(&destBase[destStart], src, len );
+		memcpy(&destBase[destStart], src, len*16);
 		destStart += len;
 	}
 	else {
 		uint firstcopylen = destSize - destStart;
-		memcpy_qwc(&destBase[destStart], src, firstcopylen );
+		memcpy(&destBase[destStart], src, firstcopylen*16);
 		destStart = endpos % destSize;
-		memcpy_qwc(destBase, src+firstcopylen, destStart );
+		memcpy(destBase, src+firstcopylen, destStart*16);
 	}
 }

 inline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) {
 	uint endpos = srcStart + len;
 	if ( endpos < srcSize ) {
-		memcpy_qwc(dest, &srcBase[srcStart], len );
+		memcpy(dest, &srcBase[srcStart], len*16);
 		srcStart += len;
 	}
 	else {
 		uint firstcopylen = srcSize - srcStart;
-		memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+		memcpy(dest, &srcBase[srcStart], firstcopylen*16);
 		srcStart = endpos % srcSize;
-		memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
+		memcpy(dest+firstcopylen, srcBase, srcStart*16);
 	}
 }
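A note on this hunk: memcpy_qwc counted in 128-bit quadwords, so the byte-oriented memcpy calls must scale by 16, while pointer arithmetic on u128* (src+firstcopylen, &destBase[destStart]) already advances in 16-byte units. A self-contained sketch of the same wrapped-destination logic, with u128 modeled as a plain 16-byte struct (the names here are illustrative, not PCSX2's):

```cpp
#include <cassert>
#include <cstring>

// Stand-in for PCSX2's u128: one 16-byte quadword.
struct u128 { unsigned char b[16]; };

// Same wrap logic as MemCopy_WrappedDest: copy 'len' quadwords into a
// ring of 'destSize' quadwords, splitting the copy at the wrap point.
static void wrappedDest(const u128* src, u128* destBase,
                        unsigned& destStart, unsigned destSize, unsigned len)
{
    unsigned endpos = destStart + len;
    if (endpos < destSize) {
        std::memcpy(&destBase[destStart], src, len * 16); // qwc -> bytes
        destStart += len;
    } else {
        unsigned firstcopylen = destSize - destStart;
        std::memcpy(&destBase[destStart], src, firstcopylen * 16);
        destStart = endpos % destSize;                    // wrapped remainder
        std::memcpy(destBase, src + firstcopylen, destStart * 16);
    }
}

int main()
{
    u128 ring[8] = {}, data[5];
    for (int i = 0; i < 5; ++i) data[i].b[0] = (unsigned char)(i + 1);

    unsigned writePos = 6;            // 2 qwords fit, 3 wrap to the front
    wrappedDest(data, ring, writePos, 8, 5);

    assert(writePos == 3);            // (6 + 5) % 8
    assert(ring[6].b[0] == 1 && ring[7].b[0] == 2 && ring[0].b[0] == 3);
    return 0;
}
```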
@@ -222,7 +222,7 @@ struct Gif_Path {
 		}
 		//DevCon.WriteLn("Realign Packet [%d]", curSize - offset);
 		if (intersect) memmove(buffer, &buffer[offset], curSize - offset);
-		else memcpy_fast(buffer, &buffer[offset], curSize - offset);
+		else memcpy(buffer, &buffer[offset], curSize - offset);
 		curSize -= offset;
 		curOffset = gsPack.size;
 		gsPack.offset = 0;
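The intersect branch keeps memmove on purpose: memcpy is undefined when source and destination overlap, whereas memmove copies as if through a temporary buffer. A quick illustration with hypothetical buffer contents:

```cpp
#include <cstdio>
#include <cstring>

int main()
{
    // Shift a "packet" 2 bytes toward the front of its own buffer.
    char a[] = "ABCDEF";
    std::memmove(a, a + 2, 4);   // overlapping regions: memmove is required
    a[4] = '\0';
    std::printf("%s\n", a);      // prints "CDEF"

    // std::memcpy(a, a + 2, 4) would be undefined behavior here, which is
    // why Gif_Path keeps memmove for the intersecting case above.
    return 0;
}
```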
@@ -241,8 +241,7 @@ struct Gif_Path {
 			mtgsReadWait(); // Let MTGS run to free up buffer space
 		}
 		pxAssertDev(curSize+size<=buffSize, "Gif Path Buffer Overflow!");
-		if (aligned) memcpy_qwc (&buffer[curSize], pMem, size/16);
-		else memcpy_fast(&buffer[curSize], pMem, size);
+		memcpy (&buffer[curSize], pMem, size);
 		curSize += size;
 	}
@@ -181,7 +181,7 @@ void SysMtgsThread::OpenPlugin()
 {
 	if( m_PluginOpened ) return;

-	memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
+	memcpy( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
 	GSsetBaseMem( RingBuffer.Regs );
 	GSirqCallback( dummyIrqCallback );

@@ -626,7 +626,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
 	if (syncRegs) {
 		ScopedLock lock(m_mtx_WaitGS);
 		// Completely synchronize GS and MTGS register states.
-		memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
+		memcpy(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
 	}
 }
@@ -217,7 +217,7 @@ __fi u32 VU_Thread::Read()

 __fi void VU_Thread::Read(void* dest, u32 size)
 {
-	memcpy_fast(dest, &buffer[read_pos], size);
+	memcpy(dest, &buffer[read_pos], size);
 	incReadPos(size_u32(size));
 }

@@ -240,7 +240,7 @@ __fi void VU_Thread::Write(u32 val)
 }
 __fi void VU_Thread::Write(void* src, u32 size)
 {
-	memcpy_fast(GetWritePtr(), src, size);
+	memcpy(GetWritePtr(), src, size);
 	write_offset += size_u32(size);
 }
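For context, VU_Thread's ring positions advance in u32 words; size_u32 presumably rounds the byte count up to whole words before bumping read_pos (an assumption — the helper's body is not shown in this diff). A toy version of that bulk-read pattern, with assumed names:

```cpp
#include <cstdint>
#include <cstring>

// Toy single-producer/single-consumer ring, word-granular like VU_Thread.
struct RingReader {
    static const uint32_t kWords = 1024;
    uint32_t buffer[kWords];
    uint32_t read_pos = 0;

    // Round a byte count up to whole u32 words (assumed size_u32 behavior).
    static uint32_t size_u32(uint32_t bytes) { return (bytes + 3) / 4; }

    void incReadPos(uint32_t words) { read_pos = (read_pos + words) % kWords; }

    // Bulk read, mirroring VU_Thread::Read(void*, u32); the caller is
    // assumed never to request a span crossing the wrap point.
    void Read(void* dest, uint32_t size) {
        std::memcpy(dest, &buffer[read_pos], size);
        incReadPos(size_u32(size));
    }
};
```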
@@ -220,7 +220,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
 		// the transfer is most likely wrapped/partial. We need to queue it into a linear buffer
 		// and then send it on its way on the next copy.

-		memcpy_qwc( path1queue, src128, data );
+		memcpy( path1queue, src128, data*16);
 		path1size = data;
 	}
 	else

@@ -235,7 +235,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
 		if (src128 == RingBuffer.m_Ring)
 		{
 			pxAssert( (data+path1size) <= 0x400 );
-			memcpy_qwc( &path1queue[path1size], src128, data );
+			memcpy( &path1queue[path1size], src128, data*16);
 			path1size += data;
 		}
 		GSgifTransfer1( (u32*)path1queue, 0 );
@@ -455,7 +455,7 @@ static s32 CALLBACK CDVD_getBuffer2(u8* buffer)
 	u8* pb = CDVD->getBuffer();
 	if(pb == NULL) return -2;

-	memcpy_fast( buffer, pb, lastReadSize );
+	memcpy( buffer, pb, lastReadSize );
 	return 0;
 }
@@ -216,7 +216,7 @@ static int __Deci2Call(int call, u32 *addr)
 			pdeciaddr += (d2ptr[4]+0xc) % 16;

 		const int copylen = std::min<uint>(255, d2ptr[1]-0xc);
-		memcpy_fast(deci2buffer, pdeciaddr, copylen );
+		memcpy(deci2buffer, pdeciaddr, copylen );
 		deci2buffer[copylen] = '\0';

 		eeConLog( ShiftJIS_ConvertString(deci2buffer) );
@@ -97,7 +97,7 @@ int _SPR0chain()
 		//Taking an arbitary small value for games which like to check the QWC/MADR instead of STR, so get most of
 		//the cycle delay out of the way before the end.
 		partialqwc = spr0ch.qwc;
-		memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
+		memcpy(pMem, &psSu128(spr0ch.sadr), partialqwc*16);

 		// clear VU mem also!
 		TestClearVUs(spr0ch.madr, partialqwc, true);

@@ -151,7 +151,7 @@ void _SPR0interleave()
 			case MFD_RESERVED:
 				// clear VU mem also!
 				TestClearVUs(spr0ch.madr, spr0ch.qwc, true);
-				memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
+				memcpy(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc*16);
 				break;
 		}
 		spr0ch.sadr += spr0ch.qwc * 16;

@@ -322,7 +322,7 @@ __fi static void SPR1transfer(const void* data, int qwc)
 		TestClearVUs(spr1ch.madr, spr1ch.qwc, false);
 	}

-	memcpy_qwc(&psSu128(spr1ch.sadr), data, qwc);
+	memcpy(&psSu128(spr1ch.sadr), data, qwc*16);
 	spr1ch.sadr += qwc * 16;
 }

@@ -381,7 +381,7 @@ void _SPR1interleave()
 		spr1ch.qwc = std::min(tqwc, qwc);
 		qwc -= spr1ch.qwc;
 		pMem = SPRdmaGetAddr(spr1ch.madr, false);
-		memcpy_qwc(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc);
+		memcpy(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc*16);
 		spr1ch.sadr += spr1ch.qwc * 16;
 		spr1ch.madr += (sqwc + spr1ch.qwc) * 16;
 	}
@@ -126,7 +126,7 @@ SaveStateBase& SaveStateBase::FreezeBios()
 	pxToUTF8 utf8(BiosDescription);

 	memzero( biosdesc );
-	memcpy_fast( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
+	memcpy( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );

 	Freeze( bioscheck );
 	Freeze( biosdesc );

@@ -282,7 +282,7 @@ void memSavingState::FreezeMem( void* data, int size )
 	if (!size) return;

 	m_memory->MakeRoomFor( m_idx + size );
-	memcpy_fast( m_memory->GetPtr(m_idx), data, size );
+	memcpy( m_memory->GetPtr(m_idx), data, size );
 	m_idx += size;
 }

@@ -322,7 +322,7 @@ void memLoadingState::FreezeMem( void* data, int size )
 {
 	const u8* const src = m_memory->GetPtr(m_idx);
 	m_idx += size;
-	memcpy_fast( data, src, size );
+	memcpy( data, src, size );
 }

 // --------------------------------------------------------------------------------------
@@ -53,8 +53,8 @@ struct sifFifo
 		const int wP0 = std::min((FIFO_SIF_W - writePos), words);
 		const int wP1 = words - wP0;

-		memcpy_fast(&data[writePos], from, wP0 << 2);
-		memcpy_fast(&data[0], &from[wP0], wP1 << 2);
+		memcpy(&data[writePos], from, wP0 << 2);
+		memcpy(&data[0], &from[wP0], wP1 << 2);

 		writePos = (writePos + words) & (FIFO_SIF_W - 1);
 		size += words;

@@ -69,8 +69,8 @@ struct sifFifo
 		const int wP0 = std::min((FIFO_SIF_W - readPos), words);
 		const int wP1 = words - wP0;

-		memcpy_fast(to, &data[readPos], wP0 << 2);
-		memcpy_fast(&to[wP0], &data[0], wP1 << 2);
+		memcpy(to, &data[readPos], wP0 << 2);
+		memcpy(&to[wP0], &data[0], wP1 << 2);

 		readPos = (readPos + words) & (FIFO_SIF_W - 1);
 		size -= words;
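sifFifo's branch-free wrap works because FIFO_SIF_W is a power of two, making `& (FIFO_SIF_W - 1)` a cheap modulo, and because wP1 is zero whenever the transfer does not cross the end of the array, so the second memcpy degenerates to a harmless zero-byte copy. A standalone sketch of the write side; the 128-word size is an assumed stand-in for FIFO_SIF_W:

```cpp
#include <algorithm>
#include <cstring>

// Toy SIF-style FIFO of 32-bit words; 128 is an assumed power-of-two size.
struct WordFifo {
    static const int kWords = 128;
    unsigned data[kWords];
    int writePos = 0, size = 0;

    void write(const unsigned* from, int words) {
        // First segment up to the end of the array, remainder from index 0.
        const int wP0 = std::min(kWords - writePos, words);
        const int wP1 = words - wP0;                  // 0 when no wrap occurs

        std::memcpy(&data[writePos], from, wP0 << 2); // words -> bytes
        std::memcpy(&data[0], &from[wP0], wP1 << 2);  // no-op if wP1 == 0

        writePos = (writePos + words) & (kWords - 1); // power-of-two wrap
        size += words;
    }
};
```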
@@ -319,7 +319,7 @@ SIO_WRITE memcardErase(u8 data)
 	{
 		case 0x82: // Erase
 			//siomode = SIO_DUMMY; // Nothing more to do here.
-			memcpy_fast(sio.buf, &header[1], 4);
+			memcpy(sio.buf, &header[1], 4);
 			sio.bufSize = 3;
 			mcd->EraseBlock();
 			break;

@@ -367,7 +367,7 @@ SIO_WRITE memcardWrite(u8 data)
 		switch(data)
 		{
 			case 0x42: // Write
-				memcpy_fast(sio.buf, header, 4);
+				memcpy(sio.buf, header, 4);
 				once = true;
 				break;

@@ -375,7 +375,7 @@ SIO_WRITE memcardWrite(u8 data)
 			if(once)
 			{
 				siomode = SIO_DUMMY; // Nothing more to do here.
-				memcpy_fast(sio.buf, &header[1], 4);
+				memcpy(sio.buf, &header[1], 4);
 				sio.bufSize = 3;

 				sio2.packet.recvVal1 = 0x1600; // Writing

@@ -454,7 +454,7 @@ SIO_WRITE memcardRead(u8 data)
 		switch(data)
 		{
 			case 0x43: // Read
-				memcpy_fast(sio.buf, header, 4);
+				memcpy(sio.buf, header, 4);
 				once = true;
 				break;

@@ -462,7 +462,7 @@ SIO_WRITE memcardRead(u8 data)
 			if(once)
 			{
 				siomode = SIO_DUMMY; // Nothing more to do here.
-				memcpy_fast(sio.buf, &header[1], 4);
+				memcpy(sio.buf, &header[1], 4);
 				sio.bufSize = 3;

 				sio2.packet.recvVal1 = 0x1700; // Reading

@@ -624,7 +624,7 @@ SIO_WRITE sioWriteMemcard(u8 data)
 				cmd.mc_xor = info.Xor;
 				cmd.Z = mcd->term;

-				memcpy_fast(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
+				memcpy(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
 			}
 			break;

@@ -698,7 +698,7 @@ SIO_WRITE sioWriteMemcardPSX(u8 data)
 	{
 		case 0x53: // PSX 'S'tate // haven't seen it happen yet
 			sio.buf[1] = mcd->FLAG;
-			memcpy_fast(&sio.buf[2], memcard_psx, 8);
+			memcpy(&sio.buf[2], memcard_psx, 8);
 			siomode = SIO_DUMMY;
 			break;
@@ -296,9 +296,9 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
 		if (!idx) CpuVU0->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
 		else CpuVU1->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);

-		memcpy_fast(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
+		memcpy(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
 		size -= ((idx ? 0x4000 : 0x1000) - addr) / 4;
-		memcpy_fast(VUx.Micro, data, size);
+		memcpy(VUx.Micro, data, size);

 		vifX.tag.addr = size * 4;
 	}

@@ -310,7 +310,7 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
 		// Clear VU memory before writing!
 		if (!idx) CpuVU0->Clear(addr, size*4);
 		else CpuVU1->Clear(addr, size*4);
-		memcpy_fast(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
+		memcpy(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy
 		vifX.tag.addr += size * 4;
 	}
@@ -584,7 +584,7 @@ bool ConsoleLogFrame::Write( ConsoleColors color, const wxString& text )

 		int endpos = m_CurQueuePos + text.Length();
 		m_QueueBuffer.MakeRoomFor( endpos + 1 ); // and the null!!
-		memcpy_fast( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
+		memcpy( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
 		m_CurQueuePos = endpos;

 		// this NULL may be overwritten if the next message sent doesn't perform a color change.
@@ -126,17 +126,17 @@ namespace VU1micro
 #endif

 		runCount++;
-		memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
-		memcpy_const((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);
+		memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
+		memcpy((u8*)backVUmem, (u8*)VU1.Mem, 0x4000);

 		do { // while loop needed since not always will return finished
 			SuperVUExecuteProgram(VU1.VI[ REG_TPC ].UL & 0x3fff, 1);
 		} while( VU0.VI[ REG_VPU_STAT ].UL&0x100 );

-		memcpy_const((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
-		memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
-		memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-		memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+		memcpy((u8*)cmpVUregs, (u8*)&VU1, sizeof(VURegs));
+		memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
+		memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+		memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

 		//Currently breaking mVU execution is disabled. Check mVUtestCycles<vuIndex>() in microVU_Compile.inl
 		runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);

@@ -227,8 +227,8 @@ namespace VU1micro
 		if (mVUdebugNow) {

 			resetVUrec(1);
-			memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-			memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+			memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+			memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

 			runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);
@@ -1016,8 +1016,8 @@ void SaveBranchState()
 	s_psaveInstInfo = g_pCurInstInfo;

 	// save all mmx regs
-	memcpy_const(s_saveMMXregs, mmxregs, sizeof(mmxregs));
-	memcpy_const(s_saveXMMregs, xmmregs, sizeof(xmmregs));
+	memcpy(s_saveMMXregs, mmxregs, sizeof(mmxregs));
+	memcpy(s_saveXMMregs, xmmregs, sizeof(xmmregs));
 }

 void LoadBranchState()

@@ -1031,8 +1031,8 @@ void LoadBranchState()
 	g_pCurInstInfo = s_psaveInstInfo;

 	// restore all mmx regs
-	memcpy_const(mmxregs, s_saveMMXregs, sizeof(mmxregs));
-	memcpy_const(xmmregs, s_saveXMMregs, sizeof(xmmregs));
+	memcpy(mmxregs, s_saveMMXregs, sizeof(mmxregs));
+	memcpy(xmmregs, s_saveXMMregs, sizeof(xmmregs));
 }

 void iFlushCall(int flushtype)
@@ -2179,7 +2179,7 @@ StartRecomp:
 			}
 		}

-		memcpy_fast(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
+		memcpy(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
 	}

 	s_pCurBlock->SetFnptr((uptr)recPtr);
@@ -181,8 +181,8 @@ __ri microProgram* mVUcreateProg(microVU& mVU, int startPC) {

 // Caches Micro Program
 __ri void mVUcacheProg(microVU& mVU, microProgram& prog) {
-	if (!mVU.index) memcpy_const(prog.data, mVU.regs().Micro, 0x1000);
-	else memcpy_const(prog.data, mVU.regs().Micro, 0x4000);
+	if (!mVU.index) memcpy(prog.data, mVU.regs().Micro, 0x1000);
+	else memcpy(prog.data, mVU.regs().Micro, 0x4000);
 	mVUdumpProg(mVU, prog);
 }
@@ -92,7 +92,7 @@ public:
 				blockEnd = blockList = newBlock;
 			}

-			memcpy_const(&newBlock->block, pBlock, sizeof(microBlock));
+			memcpy(&newBlock->block, pBlock, sizeof(microBlock));
 			thisBlock = &newBlock->block;
 		}
 		return thisBlock;
@@ -170,7 +170,7 @@ void normBranchCompile(microVU& mVU, u32 branchPC) {
 }

 void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
-	memcpy_const(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+	memcpy(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
 	mVUsetupBranch(mVU, mFC);
 	mVUbackupRegs(mVU);

@@ -386,7 +386,7 @@ void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
 	s32* ajmp = xJcc32((JccComparisonType)JMPcc);
 	u32 bPC = iPC; // mVUcompile can modify iPC, mVUpBlock, and mVUregs so back them up
 	microBlock* pBlock = mVUpBlock;
-	memcpy_const(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+	memcpy(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));

 	incPC2(1); // Get PC for branch not-taken
 	mVUcompile(mVU, xPC, (uptr)&mVUregs);

@@ -427,10 +427,10 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr) {
 	mVU.p = 0; // All blocks start at p index #0
 	mVU.q = 0; // All blocks start at q index #0
 	if ((uptr)&mVUregs != pState) { // Loads up Pipeline State Info
-		memcpy_const((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
+		memcpy((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
 	}
 	if (doEarlyExit(mVU) && ((uptr)&mVU.prog.lpState != pState)) {
-		memcpy_const((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
+		memcpy((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
 	}
 	mVUblock.x86ptrStart = thisPtr;
 	mVUpBlock = mVUblocks[mVUstartPC/2]->add(&mVUblock); // Add this block to block manager

@@ -530,7 +530,7 @@ void mVUDoTBit(microVU& mVU, microFlagCycles* mFC)

 void mVUSaveFlags(microVU& mVU,microFlagCycles &mFC, microFlagCycles &mFCBackup)
 {
-	memcpy_fast(&mFCBackup, &mFC, sizeof(microFlagCycles));
+	memcpy(&mFCBackup, &mFC, sizeof(microFlagCycles));
 	mVUsetFlags(mVU, mFCBackup); // Sets Up Flag instances
 }
 void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
@@ -86,7 +86,7 @@ public:
 				wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
 			);
 		}
-		memcpy_const(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
+		memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
 	}
 	void clear() {
 		for (int i = 0; i < hSize; i++) {
@@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) {

 	if (ret == vif.tag.size) { // Full Transfer
 		if (v.bSize) { // Last transfer was partial
-			memcpy_aligned(&v.buffer[v.bSize], data, size);
+			memcpy(&v.buffer[v.bSize], data, size);
 			v.bSize += size;
 			size = v.bSize;
 			data = v.buffer;

@@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) {
 		v.bSize = 0;
 	}
 	else { // Partial Transfer
-		memcpy_aligned(&v.buffer[v.bSize], data, size);
+		memcpy(&v.buffer[v.bSize], data, size);
 		v.bSize += size;
 		vif.tag.size -= ret;
@@ -131,15 +131,15 @@ void recSuperVU1::Execute(u32 cycles) {
 #endif

 	runCount++;
-	memcpy_const((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
-	memcpy_const((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);
+	memcpy((u8*)backVUregs, (u8*)&VU1, sizeof(VURegs));
+	memcpy((u8*)backVUmem, (u8*) VU1.Mem, 0x4000);

 	runMVU1(cycles);

-	memcpy_const((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
-	memcpy_const((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
-	memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-	memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+	memcpy((u8*)cmpVUregs,(u8*)&VU1, sizeof(VURegs));
+	memcpy((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000);
+	memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+	memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

 	runSVU1(cycles);
 	if ((memcmp((u8*)cmpVUregs, (u8*)&VU1, (16*32) + (16*16))) || (memcmp((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000))) {

@@ -230,8 +230,8 @@ void recSuperVU1::Execute(u32 cycles) {

 		resetMVU1();

-		memcpy_const((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
-		memcpy_const((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);
+		memcpy((u8*)&VU1, (u8*)backVUregs, sizeof(VURegs));
+		memcpy((u8*)VU1.Mem, (u8*)backVUmem, 0x4000);

 		runMVU1(cycles);
@@ -898,7 +898,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
 			//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
 			r.pmem = malloc(r.size);
-			memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
+			memcpy(r.pmem, &VU->Micro[r.start], r.size);
 #endif
 			s_pFnHeader->ranges.push_back(r);
 		}
@@ -647,7 +647,6 @@ char *SysLibError(); // Gets previous error loading sysbols
 void SysCloseLibrary(void *lib); // Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>
@@ -148,7 +148,6 @@
     <ClCompile Include="Conf.cpp" />
     <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />

@@ -199,4 +198,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
 </Project>
@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>

@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
 </Project>
@@ -154,7 +154,6 @@
     <ClCompile Include="Conf.cpp" />
    <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />

@@ -205,4 +204,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
 </Project>
@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>

@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
 </Project>
@@ -154,7 +154,6 @@
     <ClCompile Include="Conf.cpp" />
     <ClCompile Include="..\GSmain.cpp" />
     <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
     <ClCompile Include="..\Regs.cpp" />
     <ClCompile Include="..\targets.cpp" />
     <ClCompile Include="Win32.cpp" />

@@ -205,4 +204,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
 </Project>
@@ -24,9 +24,6 @@
     <ClCompile Include="..\Mem.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
     <ClCompile Include="..\Regs.cpp">
       <Filter>Source Files</Filter>
     </ClCompile>

@@ -94,4 +91,4 @@
       <Filter>Source Files</Filter>
     </CustomBuild>
   </ItemGroup>
 </Project>
@ -1,479 +0,0 @@
|
||||||
/******************************************************************************
|
|
||||||
|
|
||||||
Copyright (c) 2001 Advanced Micro Devices, Inc.
|
|
||||||
|
|
||||||
LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
|
|
||||||
EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
|
|
||||||
NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
|
|
||||||
PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
|
|
||||||
DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
|
|
||||||
BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
|
|
||||||
INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
|
|
||||||
OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
|
|
||||||
OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
|
|
||||||
NOT APPLY TO YOU.
|
|
||||||
|
|
||||||
AMD does not assume any responsibility for any errors which may appear in the
|
|
||||||
Materials nor any responsibility to support or update the Materials. AMD retains
|
|
||||||
the right to make changes to its test specifications at any time, without notice.
|
|
||||||
|
|
||||||
NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
|
|
||||||
further information, software, technical information, know-how, or show-how
|
|
||||||
available to you.
|
|
||||||
|
|
||||||
So that all may benefit from your experience, please report any problems
|
|
||||||
or suggestions about this software to 3dsdk.support@amd.com
|
|
||||||
|
|
||||||
AMD Developer Technologies, M/S 585
|
|
||||||
Advanced Micro Devices, Inc.
|
|
||||||
5900 E. Ben White Blvd.
|
|
||||||
Austin, TX 78741
|
|
||||||
3dsdk.support@amd.com
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
|
|
||||||
/*****************************************************************************
|
|
||||||
MEMCPY_AMD.CPP
|
|
||||||
******************************************************************************/
|
|
||||||
|
|
||||||
// Very optimized memcpy() routine for AMD Athlon and Duron family.
|
|
||||||
// This code uses any of FOUR different basic copy methods, depending
|
|
||||||
// on the transfer size.
|
|
||||||
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
|
|
||||||
// "Streaming Store"), and also uses the software prefetch instructions,
|
|
||||||
// be sure you're running on Athlon/Duron or other recent CPU before calling!
|
|
||||||
|
|
||||||
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy
|
|
||||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
|
||||||
// form which is an "unrolled loop".
|
|
||||||
|
|
||||||
#define IN_CACHE_COPY 2 * 1024 // upper limit for movq/movq copy w/SW prefetch
|
|
||||||
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
|
|
||||||
// also using the "unrolled loop" optimization. This code uses
|
|
||||||
// the software prefetch instruction to get the data into the cache.
|
|
||||||
|
|
||||||
#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
|
|
||||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
|
||||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
|
||||||
// bypasses the cache and writes straight to main memory. This code also
|
|
||||||
// uses the software prefetch instruction to pre-read the data.
|
|
||||||
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
|
|
||||||
|
|
||||||
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
|
|
||||||
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
|
|
||||||
// For the largest size blocks, a special technique called Block Prefetch
|
|
||||||
// can be used to accelerate the read operations. Block Prefetch reads
|
|
||||||
// one address per cache line, for a series of cache lines, in a short loop.
|
|
||||||
// This is faster than using software prefetch. The technique is great for
|
|
||||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
|
||||||
|
|
||||||
//#include <stddef.h>
|
|
||||||
|
|
||||||
// Inline assembly syntax for use with Visual C++
|
|
||||||
#ifdef _WIN32
|
|
||||||
#include <windows.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "PS2Etypes.h"
|
|
||||||
|
|
||||||
extern "C" {
|
|
||||||
|
|
||||||
#if defined(_MSC_VER) && !defined(__x86_64__)
|
|
||||||
|
|
||||||
void * memcpy_amd(void *dest, const void *src, size_t n)
|
|
||||||
{
|
|
||||||
__asm {
|
|
||||||
mov ecx, [n] ; number of bytes to copy
|
|
||||||
mov edi, [dest] ; destination
|
|
||||||
mov esi, [src] ; source
|
|
||||||
mov ebx, ecx ; keep a copy of count
|
|
||||||
|
|
||||||
cld
|
|
||||||
cmp ecx, TINY_BLOCK_COPY
|
|
||||||
jb $memcpy_ic_3 ; tiny? skip mmx copy
|
|
||||||
|
|
||||||
cmp ecx, 32*1024 ; don't align between 32k-64k because
|
|
||||||
jbe $memcpy_do_align ; it appears to be slower
|
|
||||||
cmp ecx, 64*1024
|
|
||||||
jbe $memcpy_align_done
|
|
||||||
$memcpy_do_align:
|
|
||||||
mov ecx, 8 ; a trick that's faster than rep movsb...
|
|
||||||
sub ecx, edi ; align destination to qword
|
|
||||||
and ecx, 111b ; get the low bits
|
|
||||||
sub ebx, ecx ; update copy count
|
|
||||||
neg ecx ; set up to jump into the array
|
|
||||||
add ecx, offset $memcpy_align_done
|
|
||||||
jmp ecx ; jump to array of movsb's
|
|
||||||
|
|
||||||
align 4
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
movsb
|
|
||||||
|
|
||||||
$memcpy_align_done: ; destination is dword aligned
|
|
||||||
mov ecx, ebx ; number of bytes left to copy
|
|
||||||
shr ecx, 6 ; get 64-byte block count
|
|
||||||
jz $memcpy_ic_2 ; finish the last few bytes
|
|
||||||
|
|
||||||
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
|
|
||||||
jae $memcpy_uc_test
|
|
||||||
|
|
||||||
// This is small block copy that uses the MMX registers to copy 8 bytes
|
|
||||||
// at a time. It uses the "unrolled loop" optimization, and also uses
|
|
||||||
// the software prefetch instruction to get the data into the cache.
|
|
||||||
align 16
|
|
||||||
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
|
|
||||||
|
|
||||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
|
||||||
|
|
||||||
movq mm0, [esi+0] ; read 64 bits
|
|
||||||
movq mm1, [esi+8]
|
|
||||||
movq [edi+0], mm0 ; write 64 bits
|
|
||||||
movq [edi+8], mm1 ; note: the normal movq writes the
|
|
||||||
movq mm2, [esi+16] ; data to cache; a cache line will be
|
|
||||||
movq mm3, [esi+24] ; allocated as needed, to store the data
|
|
||||||
movq [edi+16], mm2
|
|
||||||
movq [edi+24], mm3
|
|
||||||
movq mm0, [esi+32]
|
|
||||||
movq mm1, [esi+40]
|
|
||||||
movq [edi+32], mm0
|
|
||||||
movq [edi+40], mm1
|
|
||||||
movq mm2, [esi+48]
|
|
||||||
movq mm3, [esi+56]
|
|
||||||
movq [edi+48], mm2
|
|
||||||
movq [edi+56], mm3
|
|
||||||
|
|
||||||
add esi, 64 ; update source pointer
|
|
||||||
add edi, 64 ; update destination pointer
|
|
||||||
dec ecx ; count down
|
|
||||||
jnz $memcpy_ic_1 ; last 64-byte block?
|
|
||||||
|
|
||||||
$memcpy_ic_2:
|
|
||||||
mov ecx, ebx ; has valid low 6 bits of the byte count
|
|
||||||
$memcpy_ic_3:
|
|
||||||
shr ecx, 2 ; dword count
|
|
||||||
and ecx, 1111b ; only look at the "remainder" bits
|
|
||||||
neg ecx ; set up to jump into the array
|
|
||||||
add ecx, offset $memcpy_last_few
|
|
||||||
jmp ecx ; jump to array of movsd's
|
|
||||||
|
|
||||||
$memcpy_uc_test:
|
|
||||||
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
|
|
||||||
jae $memcpy_bp_1
|
|
||||||
|
|
||||||
$memcpy_64_test:
|
|
||||||
or ecx, ecx ; tail end of block prefetch will jump here
|
|
||||||
jz $memcpy_ic_2 ; no more 64-byte blocks left
|
|
||||||
|
|
||||||
// For larger blocks, which will spill beyond the cache, it's faster to
|
|
||||||
// use the Streaming Store instruction MOVNTQ. This write instruction
|
|
||||||
// bypasses the cache and writes straight to main memory. This code also
|
|
||||||
// uses the software prefetch instruction to pre-read the data.
|
|
||||||
align 16
|
|
||||||
$memcpy_uc_1: ; 64-byte blocks, uncached copy
|
|
||||||
|
|
||||||
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
|
|
||||||
|
|
||||||
movq mm0,[esi+0] ; read 64 bits
|
|
||||||
add edi,64 ; update destination pointer
|
|
||||||
movq mm1,[esi+8]
|
|
||||||
add esi,64 ; update source pointer
|
|
||||||
movq mm2,[esi-48]
|
|
||||||
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
|
|
||||||
movq mm0,[esi-40] ; note: movntq also prevents the CPU
|
|
||||||
movntq [edi-56], mm1 ; from READING the destination address
|
|
||||||
movq mm1,[esi-32] ; into the cache, only to be over-written
|
|
||||||
movntq [edi-48], mm2 ; so that also helps performance
|
|
||||||
movq mm2,[esi-24]
|
|
||||||
movntq [edi-40], mm0
|
|
||||||
movq mm0,[esi-16]
|
|
||||||
movntq [edi-32], mm1
|
|
||||||
movq mm1,[esi-8]
|
|
||||||
movntq [edi-24], mm2
|
|
||||||
movntq [edi-16], mm0
|
|
||||||
dec ecx
|
|
||||||
movntq [edi-8], mm1
|
|
||||||
jnz $memcpy_uc_1 ; last 64-byte block?
|
|
||||||
|
|
||||||
jmp $memcpy_ic_2 ; almost done
|
|
||||||
|
|
||||||
// For the largest size blocks, a special technique called Block Prefetch
|
|
||||||
// can be used to accelerate the read operations. Block Prefetch reads
|
|
||||||
// one address per cache line, for a series of cache lines, in a short loop.
|
|
||||||
// This is faster than using software prefetch. The technique is great for
|
|
||||||
// getting maximum read bandwidth, especially in DDR memory systems.
|
|
||||||
$memcpy_bp_1: ; large blocks, block prefetch copy
|
|
||||||
|
|
||||||
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
|
|
||||||
jl $memcpy_64_test ; no, back to regular uncached copy
|
|
||||||
|
|
||||||
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
|
|
||||||
add esi, CACHEBLOCK * 64 ; move to the top of the block
|
|
||||||
align 16
|
|
||||||
$memcpy_bp_2:
|
|
||||||
mov edx, [esi-64] ; grab one address per cache line
|
|
||||||
mov edx, [esi-128] ; grab one address per cache line
|
|
||||||
sub esi, 128 ; go reverse order to suppress HW prefetcher
|
|
||||||
dec eax ; count down the cache lines
|
|
||||||
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
|
|
||||||
|
|
||||||
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
|
|
||||||
align 16
|
|
||||||
$memcpy_bp_3:
|
|
||||||
movq mm0, [esi ] ; read 64 bits
|
|
||||||
movq mm1, [esi+ 8]
|
|
||||||
movq mm2, [esi+16]
|
|
||||||
movq mm3, [esi+24]
|
|
||||||
movq mm4, [esi+32]
|
|
||||||
movq mm5, [esi+40]
|
|
||||||
movq mm6, [esi+48]
|
|
||||||
movq mm7, [esi+56]
|
|
||||||
add esi, 64 ; update source pointer
|
|
||||||
movntq [edi ], mm0 ; write 64 bits, bypassing cache
|
|
||||||
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
|
|
||||||
movntq [edi+16], mm2 ; from READING the destination address
|
|
||||||
movntq [edi+24], mm3 ; into the cache, only to be over-written,
|
|
||||||
movntq [edi+32], mm4 ; so that also helps performance
|
|
||||||
movntq [edi+40], mm5
|
|
||||||
movntq [edi+48], mm6
|
|
||||||
movntq [edi+56], mm7
|
|
||||||
add edi, 64 ; update dest pointer
|
|
||||||
|
|
||||||
dec eax ; count down
|
|
||||||
|
|
||||||
jnz $memcpy_bp_3 ; keep copying
|
|
||||||
sub ecx, CACHEBLOCK ; update the 64-byte block count
|
|
||||||
jmp $memcpy_bp_1 ; keep processing chunks
|
|
||||||
|
|
||||||
// The smallest copy uses the X86 "movsd" instruction, in an optimized
|
|
||||||
// form which is an "unrolled loop". Then it handles the last few bytes.
|
|
||||||
align 4
|
|
||||||
movsd
|
|
||||||
movsd ; perform last 1-15 dword copies
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd ; perform last 1-7 dword copies
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
movsd
|
|
||||||
|
|
||||||
$memcpy_last_few: ; dword aligned from before movsd's
|
|
||||||
mov ecx, ebx ; has valid low 2 bits of the byte count
|
|
||||||
and ecx, 11b ; the last few cows must come home
|
|
||||||
jz $memcpy_final ; no more, let's leave
|
|
||||||
rep movsb ; the last 1, 2, or 3 bytes
|
|
||||||
|
|
||||||
$memcpy_final:
|
|
||||||
emms ; clean up the MMX state
|
|
||||||
sfence ; flush the write buffer
|
|
||||||
mov eax, [dest] ; ret value = destination pointer
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// mmx memcpy implementation, size has to be a multiple of 8
|
|
||||||
// returns 0 is equal, nonzero value if not equal
|
|
||||||
// ~10 times faster than standard memcmp
|
|
||||||
// (zerofrog)
|
|
||||||
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
{
    assert( (cmpsize&7) == 0 );

__asm {
    push esi
    mov ecx, cmpsize
    mov edx, src1
    mov esi, src2

    cmp ecx, 32
    jl Done4

    // custom test first 8 to make sure things are ok
    movq mm0, [esi]
    movq mm1, [esi+8]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pand mm0, mm1
    movq mm2, [esi+16]
    pmovmskb eax, mm0
    movq mm3, [esi+24]

    // check if eq
    cmp eax, 0xff
    je NextComp
    mov eax, 1
    jmp End

NextComp:
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm2, mm3
    pmovmskb eax, mm2

    sub ecx, 32
    add esi, 32
    add edx, 32

    // check if eq
    cmp eax, 0xff
    je ContinueTest
    mov eax, 1
    jmp End

    cmp ecx, 64
    jl Done8

Cmp8:
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq mm4, [esi+32]
    movq mm5, [esi+40]
    movq mm6, [esi+48]
    movq mm7, [esi+56]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm0, mm1
    pcmpeqd mm4, [edx+32]
    pand mm0, mm2
    pcmpeqd mm5, [edx+40]
    pand mm0, mm3
    pcmpeqd mm6, [edx+48]
    pand mm0, mm4
    pcmpeqd mm7, [edx+56]
    pand mm0, mm5
    pand mm0, mm6
    pand mm0, mm7
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    je Continue
    mov eax, 1
    jmp End

Continue:
    sub ecx, 64
    add esi, 64
    add edx, 64
ContinueTest:
    cmp ecx, 64
    jge Cmp8

Done8:
    test ecx, 0x20
    jz Done4
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm0, mm1
    pand mm0, mm2
    pand mm0, mm3
    pmovmskb eax, mm0
    sub ecx, 32
    add esi, 32
    add edx, 32

    // check if eq
    cmp eax, 0xff
    je Done4
    mov eax, 1
    jmp End

Done4:
    cmp ecx, 24
    jne Done2
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pand mm0, mm1
    pand mm0, mm2
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    setne al
    jmp End

Done2:
    cmp ecx, 16
    jne Done1

    movq mm0, [esi]
    movq mm1, [esi+8]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pand mm0, mm1
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    setne al
    jmp End

Done1:
    cmp ecx, 8
    jne Done

    mov eax, [esi]
    mov esi, [esi+4]
    cmp eax, [edx]
    je Next
    mov eax, 1
    jmp End

Next:
    cmp esi, [edx+4]
    setne al
    jmp End

Done:
    xor eax, eax

End:
    pop esi
    emms
}
}

#else // _MSC_VER
// assume gcc or mingw or win x64

#include <memory.h>
#include <string.h>

void * memcpy_amd(void *dest, const void *src, size_t n)
{
    memcpy(dest, src, n);
    return dest;
}

#endif

}

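memcmp_mmx outlives this commit (the header hunk further down deletes only the memcpy_amd declaration and keeps memcmp_mmx). A usage sketch with a hypothetical call site, spelling out the routine's contract:

    #include <cassert>

    typedef unsigned char u8;   // stands in for the PS2Etypes.h typedef
    extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

    // Hypothetical call site: n must be a multiple of 8 (the assert inside
    // the routine enforces this), and the result is only zero/nonzero;
    // unlike memcmp, it carries no less-than/greater-than ordering.
    bool blocks_equal(const void *a, const void *b, int n)
    {
        assert((n & 7) == 0);
        return memcmp_mmx(a, b, n) == 0;
    }
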
@@ -2026,7 +2026,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 targ->clut.resize(clutsize);

 if( tex0.cpsm <= 1 ) { // 32 bit
-memcpy_amd(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+memcpy(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
 }
 else {
 u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);

@@ -2110,7 +2110,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 targ->memory->ref = 1;
 }

-memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

 u8* psrc = (u8*)(ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);

@@ -2136,7 +2136,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 targ->memory->ref = 1;
 }

-memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

 // needs to be 8 bit, use xmm for unpacking
 u16* dst = (u16*)lock.pBits;

@@ -2219,7 +2219,7 @@ Z16Loop:
 targ->memory = NULL;
 }

-memcpy_amd(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
+memcpy(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
 }
 }

@@ -2239,7 +2239,7 @@ void ZeroGS::Flush(int context)
 }

 if( curvb.tex0.cpsm <= 1 ) { // 32 bit
-memcpy_amd(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+memcpy(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
 }
 else {
 u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);

@@ -5087,7 +5087,7 @@ void ZeroGS::CaptureFrame()

 BYTE* pend = (BYTE*)lock.pBits + (conf.height-1)*width*4;
 for(int i = 0; i < conf.height; ++i) {
-memcpy_amd(&mem[width*4*i], pend - width*4*i, width * 4);
+memcpy(&mem[width*4*i], pend - width*4*i, width * 4);
 }
 s_ptexAVICapture->UnlockRect();

@@ -36,7 +36,6 @@ set(zerogsSources
 GSmain.cpp
 GLWinX11.cpp
 Mem.cpp
-memcpy_amd.cpp
 rasterfont.cpp
 Regs.cpp
 targets.cpp

@@ -728,7 +728,6 @@ char *SysLibError(); // Gets previous error loading sysbols
 void SysCloseLibrary(void *lib); // Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>

@@ -23,7 +23,7 @@ libZeroGSogl_LDFLAGS+=-Wl,-soname,@ZEROGS_SONAME@
 libZeroGSogl_LDADD=$(libZeroGSogl_a_OBJECTS)

 libZeroGSogl_a_SOURCES = \
-GSmain.cpp memcpy_amd.cpp Regs.cpp x86.cpp zpipe.cpp \
+GSmain.cpp Regs.cpp x86.cpp zpipe.cpp \
 Mem.cpp rasterfont.cpp targets.cpp zerogs.cpp GifTransfer.cpp GLWinX11.cpp

 libZeroGSogl_a_SOURCES += x86-32.S

@@ -1,478 +0,0 @@
/******************************************************************************

 Copyright (c) 2001 Advanced Micro Devices, Inc.

 LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
 EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
 NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
 PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
 DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
 BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
 INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
 OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
 OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
 NOT APPLY TO YOU.

 AMD does not assume any responsibility for any errors which may appear in the
 Materials nor any responsibility to support or update the Materials.  AMD retains
 the right to make changes to its test specifications at any time, without notice.

 NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
 further information, software, technical information, know-how, or show-how
 available to you.

 So that all may benefit from your experience, please report any problems
 or suggestions about this software to 3dsdk.support@amd.com

 AMD Developer Technologies, M/S 585
 Advanced Micro Devices, Inc.
 5900 E. Ben White Blvd.
 Austin, TX 78741
 3dsdk.support@amd.com
******************************************************************************/

#include <assert.h>

/*****************************************************************************
MEMCPY_AMD.CPP
******************************************************************************/

// Very optimized memcpy() routine for AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetch instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 2 * 1024   // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.   This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 4 * 1024   // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

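These thresholds encode a four-way dispatch on copy size (plus a destination-alignment step that the asm skips for 32-64 KB copies, where aligning measured slower). A structure-only C++ sketch, in which each std::memcpy stands in for the hand-tuned loop in the asm that follows:

    #include <cstddef>
    #include <cstring>

    // Dispatch skeleton only; the real work is the inline asm below.
    static void *memcpy_amd_sketch(void *dest, const void *src, size_t n)
    {
        if (n < 64) {                    // TINY_BLOCK_COPY
            std::memcpy(dest, src, n);   // unrolled movsd + movsb tail
        } else if (n < 2 * 1024) {       // IN_CACHE_COPY
            std::memcpy(dest, src, n);   // MMX movq pairs + prefetchnta
        } else if (n < 4 * 1024) {       // UNCACHED_COPY
            std::memcpy(dest, src, n);   // movq loads + movntq streaming stores
        } else {                         // BLOCK_PREFETCH_COPY territory
            std::memcpy(dest, src, n);   // block prefetch, then movntq
        }
        return dest;
    }
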
//#include <stddef.h>

// Inline assembly syntax for use with Visual C++
#ifdef _WIN32
#include <windows.h>
#endif

extern "C" {
#include "PS2Etypes.h"

#if defined(_MSC_VER)

void * memcpy_amd(void *dest, const void *src, size_t n)
{
__asm {
    mov ecx, [n] ; number of bytes to copy
    mov edi, [dest] ; destination
    mov esi, [src] ; source
    mov ebx, ecx ; keep a copy of count

    cld
    cmp ecx, TINY_BLOCK_COPY
    jb $memcpy_ic_3 ; tiny? skip mmx copy

    cmp ecx, 32*1024 ; don't align between 32k-64k because
    jbe $memcpy_do_align ; it appears to be slower
    cmp ecx, 64*1024
    jbe $memcpy_align_done
$memcpy_do_align:
    mov ecx, 8 ; a trick that's faster than rep movsb...
    sub ecx, edi ; align destination to qword
    and ecx, 111b ; get the low bits
    sub ebx, ecx ; update copy count
    neg ecx ; set up to jump into the array
    add ecx, offset $memcpy_align_done
    jmp ecx ; jump to array of movsb's

align 4
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb
    movsb

$memcpy_align_done: ; destination is dword aligned
    mov ecx, ebx ; number of bytes left to copy
    shr ecx, 6 ; get 64-byte block count
    jz $memcpy_ic_2 ; finish the last few bytes

    cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
    jae $memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy

    prefetchnta [esi + (200*64/34+192)] ; start reading ahead

    movq mm0, [esi+0] ; read 64 bits
    movq mm1, [esi+8]
    movq [edi+0], mm0 ; write 64 bits
    movq [edi+8], mm1 ; note: the normal movq writes the
    movq mm2, [esi+16] ; data to cache; a cache line will be
    movq mm3, [esi+24] ; allocated as needed, to store the data
    movq [edi+16], mm2
    movq [edi+24], mm3
    movq mm0, [esi+32]
    movq mm1, [esi+40]
    movq [edi+32], mm0
    movq [edi+40], mm1
    movq mm2, [esi+48]
    movq mm3, [esi+56]
    movq [edi+48], mm2
    movq [edi+56], mm3

    add esi, 64 ; update source pointer
    add edi, 64 ; update destination pointer
    dec ecx ; count down
    jnz $memcpy_ic_1 ; last 64-byte block?

$memcpy_ic_2:
    mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
    shr ecx, 2 ; dword count
    and ecx, 1111b ; only look at the "remainder" bits
    neg ecx ; set up to jump into the array
    add ecx, offset $memcpy_last_few
    jmp ecx ; jump to array of movsd's

$memcpy_uc_test:
    cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
    jae $memcpy_bp_1

$memcpy_64_test:
    or ecx, ecx ; tail end of block prefetch will jump here
    jz $memcpy_ic_2 ; no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy

    prefetchnta [esi + (200*64/34+192)] ; start reading ahead

    movq mm0,[esi+0] ; read 64 bits
    add edi,64 ; update destination pointer
    movq mm1,[esi+8]
    add esi,64 ; update source pointer
    movq mm2,[esi-48]
    movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
    movq mm0,[esi-40] ; note: movntq also prevents the CPU
    movntq [edi-56], mm1 ; from READING the destination address
    movq mm1,[esi-32] ; into the cache, only to be over-written
    movntq [edi-48], mm2 ; so that also helps performance
    movq mm2,[esi-24]
    movntq [edi-40], mm0
    movq mm0,[esi-16]
    movntq [edi-32], mm1
    movq mm1,[esi-8]
    movntq [edi-24], mm2
    movntq [edi-16], mm0
    dec ecx
    movntq [edi-8], mm1
    jnz $memcpy_uc_1 ; last 64-byte block?

    jmp $memcpy_ic_2 ; almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy

    cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
    jl $memcpy_64_test ; no, back to regular uncached copy

    mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
    add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
    mov edx, [esi-64] ; grab one address per cache line
    mov edx, [esi-128] ; grab one address per cache line
    sub esi, 128 ; go reverse order to suppress HW prefetcher
    dec eax ; count down the cache lines
    jnz $memcpy_bp_2 ; keep grabbing more lines into cache

    mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
    movq mm0, [esi   ] ; read 64 bits
    movq mm1, [esi+ 8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq mm4, [esi+32]
    movq mm5, [esi+40]
    movq mm6, [esi+48]
    movq mm7, [esi+56]
    add esi, 64 ; update source pointer
    movntq [edi   ], mm0 ; write 64 bits, bypassing cache
    movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
    movntq [edi+16], mm2 ; from READING the destination address
    movntq [edi+24], mm3 ; into the cache, only to be over-written,
    movntq [edi+32], mm4 ; so that also helps performance
    movntq [edi+40], mm5
    movntq [edi+48], mm6
    movntq [edi+56], mm7
    add edi, 64 ; update dest pointer

    dec eax ; count down

    jnz $memcpy_bp_3 ; keep copying
    sub ecx, CACHEBLOCK ; update the 64-byte block count
    jmp $memcpy_bp_1 ; keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".   Then it handles the last few bytes.
align 4
    movsd
    movsd ; perform last 1-15 dword copies
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd ; perform last 1-7 dword copies
    movsd
    movsd
    movsd
    movsd
    movsd
    movsd

$memcpy_last_few: ; dword aligned from before movsd's
    mov ecx, ebx ; has valid low 2 bits of the byte count
    and ecx, 11b ; the last few cows must come home
    jz $memcpy_final ; no more, let's leave
    rep movsb ; the last 1, 2, or 3 bytes

$memcpy_final:
    emms ; clean up the MMX state
    sfence ; flush the write buffer
    mov eax, [dest] ; ret value = destination pointer

}
}

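The $memcpy_bp_* loops implement that block prefetch. The same two-phase idea expressed in SSE2 intrinsics, as a hedged sketch under stated assumptions rather than the removed routine itself: phase 1 touches one address per 64-byte cache line, walking backwards so the hardware prefetcher doesn't run ahead; phase 2 copies the now-cached block with non-temporal stores.

    #include <cstddef>
    #include <emmintrin.h>   // SSE2: _mm_load_si128, _mm_stream_si128, _mm_sfence

    // Assumes 16-byte-aligned pointers and n a multiple of BLOCK.
    static void block_prefetch_copy(char *dst, const char *src, size_t n)
    {
        const size_t CACHE_LINE = 64;
        const size_t BLOCK = 128 * CACHE_LINE;   // CACHEBLOCK (80h) lines = 8 KB
        for (size_t base = 0; base < n; base += BLOCK) {
            // phase 1: one read per cache line, in reverse order
            for (size_t off = BLOCK; off >= CACHE_LINE; off -= CACHE_LINE)
                (void)*(volatile const char *)(src + base + off - CACHE_LINE);
            // phase 2: stream the block out, bypassing the cache
            for (size_t off = 0; off < BLOCK; off += 16) {
                __m128i v = _mm_load_si128((const __m128i *)(src + base + off));
                _mm_stream_si128((__m128i *)(dst + base + off), v);
            }
        }
        _mm_sfence();   // order the streaming stores, like the sfence above
    }
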
// mmx memcmp implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
{
    assert( (cmpsize&7) == 0 );

__asm {
    push esi
    mov ecx, cmpsize
    mov edx, src1
    mov esi, src2

    cmp ecx, 32
    jl Done4

    // custom test first 8 to make sure things are ok
    movq mm0, [esi]
    movq mm1, [esi+8]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pand mm0, mm1
    movq mm2, [esi+16]
    pmovmskb eax, mm0
    movq mm3, [esi+24]

    // check if eq
    cmp eax, 0xff
    je NextComp
    mov eax, 1
    jmp End

NextComp:
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm2, mm3
    pmovmskb eax, mm2

    sub ecx, 32
    add esi, 32
    add edx, 32

    // check if eq
    cmp eax, 0xff
    je ContinueTest
    mov eax, 1
    jmp End

    cmp ecx, 64
    jl Done8

Cmp8:
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    movq mm4, [esi+32]
    movq mm5, [esi+40]
    movq mm6, [esi+48]
    movq mm7, [esi+56]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm0, mm1
    pcmpeqd mm4, [edx+32]
    pand mm0, mm2
    pcmpeqd mm5, [edx+40]
    pand mm0, mm3
    pcmpeqd mm6, [edx+48]
    pand mm0, mm4
    pcmpeqd mm7, [edx+56]
    pand mm0, mm5
    pand mm0, mm6
    pand mm0, mm7
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    je Continue
    mov eax, 1
    jmp End

Continue:
    sub ecx, 64
    add esi, 64
    add edx, 64
ContinueTest:
    cmp ecx, 64
    jge Cmp8

Done8:
    test ecx, 0x20
    jz Done4
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    movq mm3, [esi+24]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pcmpeqd mm3, [edx+24]
    pand mm0, mm1
    pand mm0, mm2
    pand mm0, mm3
    pmovmskb eax, mm0
    sub ecx, 32
    add esi, 32
    add edx, 32

    // check if eq
    cmp eax, 0xff
    je Done4
    mov eax, 1
    jmp End

Done4:
    cmp ecx, 24
    jne Done2
    movq mm0, [esi]
    movq mm1, [esi+8]
    movq mm2, [esi+16]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pcmpeqd mm2, [edx+16]
    pand mm0, mm1
    pand mm0, mm2
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    setne al
    jmp End

Done2:
    cmp ecx, 16
    jne Done1

    movq mm0, [esi]
    movq mm1, [esi+8]
    pcmpeqd mm0, [edx]
    pcmpeqd mm1, [edx+8]
    pand mm0, mm1
    pmovmskb eax, mm0

    // check if eq
    cmp eax, 0xff
    setne al
    jmp End

Done1:
    cmp ecx, 8
    jne Done

    mov eax, [esi]
    mov esi, [esi+4]
    cmp eax, [edx]
    je Next
    mov eax, 1
    jmp End

Next:
    cmp esi, [edx+4]
    setne al
    jmp End

Done:
    xor eax, eax

End:
    pop esi
    emms
}
}

#else // _MSC_VER
// assume gcc

#include <memory.h>
#include <string.h>

void * memcpy_amd(void *dest, const void *src, size_t n)
{
    memcpy(dest, src, n);
    return dest;
}

#endif

}

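For contrast with the streaming path just shown, the in-cache path ($memcpy_ic_1) keeps ordinary cacheable stores and relies on prefetchnta running ahead of the loads. A sketch with the same caveats (an illustration, not the removed code; assumes n is a multiple of 64):

    #include <cstddef>
    #include <emmintrin.h>   // SSE2; _mm_prefetch comes via xmmintrin.h

    static void in_cache_copy(char *dst, const char *src, size_t n)
    {
        for (size_t off = 0; off < n; off += 64) {
            // read ahead ~568 bytes, the asm's (200*64/34+192) distance
            _mm_prefetch(src + off + 568, _MM_HINT_NTA);
            __m128i a = _mm_loadu_si128((const __m128i *)(src + off));
            __m128i b = _mm_loadu_si128((const __m128i *)(src + off + 16));
            __m128i c = _mm_loadu_si128((const __m128i *)(src + off + 32));
            __m128i d = _mm_loadu_si128((const __m128i *)(src + off + 48));
            _mm_storeu_si128((__m128i *)(dst + off), a);       // normal stores:
            _mm_storeu_si128((__m128i *)(dst + off + 16), b);  // destination
            _mm_storeu_si128((__m128i *)(dst + off + 32), c);  // lines are
            _mm_storeu_si128((__m128i *)(dst + off + 48), d);  // cached
        }
    }
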
@@ -1789,7 +1789,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 targ->clut.resize(clutsize);

 if( tex0.cpsm <= 1 ) { // 32 bit
-memcpy_amd(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
+memcpy(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
 }
 else {
 u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);

@@ -1854,7 +1854,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 assert(targ->ptex->ref > 0 );
 }

-memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+memcpy(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
 vector<u8> texdata;
 u8* ptexdata = NULL;

@@ -2568,7 +2568,7 @@ void ZeroGS::Flush(int context)
 g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size();
 glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
 // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 // glUnmapBuffer(GL_ARRAY_BUFFER);
 SET_STREAM();

@@ -2652,7 +2652,7 @@ void ZeroGS::Flush(int context)
 }

 if( curvb.tex0.cpsm <= 1 ) { // 32 bit
-memcpy_amd(&data[0], g_pbyGSClut+nClutOffset, clutsize);
+memcpy(&data[0], g_pbyGSClut+nClutOffset, clutsize);
 }
 else {
 u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);

@@ -5779,7 +5779,7 @@ void ZeroGS::CaptureFrame()

 // u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4;
 // for(int i = 0; i < conf.height; ++i) {
-// memcpy_amd(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
+// memcpy(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
 // }

 int fps = SMODE1->CMOD == 3 ? 50 : 60;

@@ -436,7 +436,7 @@ namespace ZeroGS {
 if( nCount + nVerts > nNumVertices ) {
 // recreate except with a bigger count
 VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256);
-memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 nNumVertices *= 2;
 assert( nCount + nVerts <= nNumVertices );
 _aligned_free(pBufferData);

@@ -55,7 +55,6 @@ set(zzoglSources
 GSmain.cpp
 HostMemory.cpp
 Mem.cpp
-# memcpy_amd.cpp
 Mem_Swizzle.cpp
 Mem_Tables.cpp
 Profile.cpp

@@ -68,7 +68,6 @@ extern "C" char* CALLBACK PS2EgetLibName(void);
 #include "GSDump.h"

 #include "Utilities/MemcpyFast.h"
-#define memcpy_amd memcpy_fast

 extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

@@ -493,7 +493,7 @@ template <>
 /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
 {
 u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
-memcpy_amd((u8*)dst, clut, clutsize);
+memcpy((u8*)dst, clut, clutsize);
 }

 template <>

@@ -657,7 +657,7 @@ inline void FlushSetStream(VB& curvb)
 g_nCurVBOIndex = (g_nCurVBOIndex + 1) % g_vboBuffers.size();
 glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
 // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 // glUnmapBuffer(GL_ARRAY_BUFFER);
 SET_STREAM();

@@ -89,7 +89,7 @@ class VB
 assert(pBufferData != NULL);
 nNumVertices *= 2;
 VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
-memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 assert(nCount <= nNumVertices);
 _aligned_free(pBufferData);
 pBufferData = ptemp;

@@ -1979,7 +1979,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 assert(targ->ptex->ref > 0);
 }

-memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
+memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));

 __aligned16 u8* ptexdata = NULL;
 bool has_data = false;

@@ -86,7 +86,6 @@ typedef signed long long int64;
 #include "GSDump.h"

 #include "Utilities/MemcpyFast.h"
-#define memcpy_amd memcpy_fast

 extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

@@ -489,7 +489,7 @@ template <>
 /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
 {
 u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
-memcpy_amd((u8*)dst, clut, clutsize);
+memcpy((u8*)dst, clut, clutsize);
 }

 template <>

@@ -364,7 +364,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 assert(targ->ptex->ref > 0);
 }

-memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
+memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));

 __aligned16 u8* ptexdata = NULL;
 bool has_data = false;

@@ -535,7 +535,7 @@ inline void FlushSetStream(VB& curvb)


 // void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-// memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+// memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 // glUnmapBuffer(GL_ARRAY_BUFFER);
 SET_STREAM();

@@ -89,7 +89,7 @@ class VB
 assert(pBufferData != NULL);
 nNumVertices *= 2;
 VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
-memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 assert(nCount <= nNumVertices);
 _aligned_free(pBufferData);
 pBufferData = ptemp;