Merge pull request #215 from xsacha/memcpy

Remove some slow, redundant memcpy implementations: memcpy_const/memcpy_...
2014-09-12 19:57:57 +02:00 · 2014-09-12 19:57:57 +02:00 · 98d22f8b2e
parent b9e6024fe9 1d116cc23b
commit 98d22f8b2e
58 changed files with 116 additions and 1092 deletions
--- a/common/include/Utilities/SafeArray.inl
+++ b/common/include/Utilities/SafeArray.inl
@ -118,7 +118,7 @@ template< typename T >
 SafeArray<T>* SafeArray<T>::Clone() const
 {
 	SafeArray<T>* retval = new SafeArray<T>( m_size );
-	memcpy_fast( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
+	memcpy( retval->GetPtr(), m_ptr, sizeof(T) * m_size );
 	return retval;
 }

@ -160,7 +160,7 @@ template< typename T, uint Alignment >
 SafeAlignedArray<T,Alignment>* SafeAlignedArray<T,Alignment>::Clone() const
 {
 	SafeAlignedArray<T,Alignment>* retval = new SafeAlignedArray<T,Alignment>( this->m_size );
-	memcpy_fast( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
+	memcpy( retval->GetPtr(), this->m_ptr, sizeof(T) * this->m_size );
 	return retval;
 }

@ -272,14 +272,14 @@ void SafeList<T>::Remove( int index )

 	int copylen = m_length - index;
 	if( copylen > 0 )
-		memcpy_fast( &m_ptr[index], &m_ptr[index+1], copylen );
+		memcpy( &m_ptr[index], &m_ptr[index+1], copylen );
 }

 template< typename T >
 SafeList<T>* SafeList<T>::Clone() const
 {
 	SafeList<T>* retval = new SafeList<T>( m_length );
-	memcpy_fast( retval->m_ptr, m_ptr, sizeof(T) * m_length );
+	memcpy( retval->m_ptr, m_ptr, sizeof(T) * m_length );
 	return retval;
 }

--- a/common/src/Utilities/FastFormatString.cpp
+++ b/common/src/Utilities/FastFormatString.cpp
@ -265,7 +265,7 @@ FastFormatUnicode& FastFormatUnicode::WriteV( const char* fmt, va_list argptr )
 	const uint inspos = m_Length;
 	const uint convLen = converted.Length();
 	m_dest->MakeRoomFor((inspos + convLen + 64) * sizeof(wxChar));
-	memcpy_fast( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
+	memcpy( &((wxChar*)m_dest->GetPtr())[inspos], converted.wc_str(), (convLen+1)*sizeof(wxChar) );
 	m_Length += convLen;

 	return *this;
--- a/common/src/x86emitter/jmp.cpp
+++ b/common/src/x86emitter/jmp.cpp
@ -60,7 +60,7 @@ void xSmartJump::SetTarget()
 		u8* destpos = xGetPtr();
 		const int copylen = (sptr)target - (sptr)saveme;

-		memcpy_fast( destpos, saveme, copylen );
+		memcpy( destpos, saveme, copylen );
 		xSetPtr( target - spacer );
 	}
 }
--- a/pcsx2/CDVD/CDVD.cpp
+++ b/pcsx2/CDVD/CDVD.cpp
@ -725,7 +725,7 @@ int cdvdReadSector() {
 		mdest[11] = 0;

 		// normal 2048 bytes of sector data
-		memcpy_const(&mdest[12], cdr.Transfer, 2048);
+		memcpy(&mdest[12], cdr.Transfer, 2048);

 		// 4 bytes of edc (not calculated at present)
 		mdest[2060] = 0;
@ -735,7 +735,7 @@ int cdvdReadSector() {
 	}
 	else
 	{
-		memcpy_fast( mdest, cdr.Transfer, cdvd.BlockSize);
+		memcpy( mdest, cdr.Transfer, cdvd.BlockSize);
 	}

 	// decrypt sector's bytes
@ -1567,7 +1567,7 @@ static void cdvdWrite16(u8 rt)		 // SCOMMAND
 				cdvd.Param[cdvd.ParamP-5], cdvd.Param[cdvd.ParamP-3], cdvd.Param[cdvd.ParamP-2], cdvd.Param[cdvd.ParamP-1]);
 			Console.WriteLn("RTC Write Sec %d Min %d Hr %d Day %d Month %d Year %d", cdvd.RTC.second, cdvd.RTC.minute,
 				cdvd.RTC.hour, cdvd.RTC.day, cdvd.RTC.month, cdvd.RTC.year);*/
-			//memcpy_fast((u8*)&cdvd.RTC, cdvd.Param, 7);
+			//memcpy((u8*)&cdvd.RTC, cdvd.Param, 7);
 			break;

 		case 0x0A: // sceCdReadNVM (2:3)
@ -1907,7 +1907,7 @@ static void cdvdWrite16(u8 rt)		 // SCOMMAND
 			}
 			else
 			{
-				memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
+				memcpy(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
 				cdvd.mg_size += cdvd.ParamC;
 				cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
 			}
@ -1915,9 +1915,9 @@ static void cdvdWrite16(u8 rt)		 // SCOMMAND

 		case 0x8E: // sceMgReadData
 			SetResultSize( std::min(16, cdvd.mg_size) );
-			memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
+			memcpy(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
 			cdvd.mg_size -= cdvd.ResultC;
-			memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
+			memcpy(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
 			break;

 		case 0x88: // secrman: __mechacon_auth_0x88	//for now it is the same; so, fall;)
@ -1984,7 +1984,7 @@ static void cdvdWrite16(u8 rt)		 // SCOMMAND
 		{
 			SetResultSize(3);//in:0
 			int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-			memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
+			memcpy(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);

 			cdvd.mg_maxsize = 0; // don't allow any write
 			cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data
--- a/pcsx2/CDVD/CDVDisoReader.cpp
+++ b/pcsx2/CDVD/CDVDisoReader.cpp
@ -422,7 +422,7 @@ s32 CALLBACK ISOreadSector(u8* tempbuffer, u32 lsn, int mode)
 		jNO_DEFAULT
 	}

-	memcpy_fast(tempbuffer, pbuffer, psize);
+	memcpy(tempbuffer, pbuffer, psize);

 	return 0;
 }
--- a/pcsx2/CDVD/CdRom.cpp
+++ b/pcsx2/CDVD/CdRom.cpp
@ -917,7 +917,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
 			}

 			cdsize = (bcr & 0xffff) * 4;
-			memcpy_fast(iopPhysMem(madr), cdr.pTransfer, cdsize);
+			memcpy(iopPhysMem(madr), cdr.pTransfer, cdsize);
 			psxCpu->Clear(madr, cdsize/4);
 			cdr.pTransfer+=cdsize;

@ -947,7 +947,7 @@ s32 CALLBACK cdvdDmaRead(s32 channel, u32* data, u32 bytesLeft, u32* bytesProces
 		return 10000;
 	}

-	memcpy_fast(data, cdr.pTransfer, wordsLeft);
+	memcpy(data, cdr.pTransfer, wordsLeft);
 	//psxCpu->Clear(madr, cdsize/4);
 	cdr.pTransfer+=wordsLeft;
 	*wordsProcessed = wordsLeft;
--- a/pcsx2/CDVD/InputIsoFile.cpp
+++ b/pcsx2/CDVD/InputIsoFile.cpp
@ -145,7 +145,7 @@ int InputIsoFile::FinishRead3(u8* dst, uint mode)
 	length = end - _offset;

 	uint read_offset = (m_current_lsn - m_read_lsn) * m_blocksize;
-	memcpy_fast(dst + diff, m_readbuffer + ndiff + read_offset, length);
+	memcpy(dst + diff, m_readbuffer + ndiff + read_offset, length);
 	
 	if (m_type == ISOTYPE_CD && diff >= 12)
 	{
--- a/pcsx2/CDVD/IsoFS/IsoFile.cpp
+++ b/pcsx2/CDVD/IsoFS/IsoFile.cpp
@ -161,7 +161,7 @@ int IsoFile::internalRead(void* dest, int off, int len)
 			slen = (int) (maxOffset - currentOffset);
 		}

-		memcpy_fast((u8*)dest + off, currentSector + sectorOffset, slen);
+		memcpy((u8*)dest + off, currentSector + sectorOffset, slen);

 		sectorOffset += slen;
 		currentOffset += slen;
--- a/pcsx2/GS.h
+++ b/pcsx2/GS.h
@ -442,27 +442,27 @@ extern __aligned(32) MTGS_BufferedData RingBuffer;
 inline void MemCopy_WrappedDest( const u128* src, u128* destBase, uint& destStart, uint destSize, uint len ) {
 	uint endpos = destStart + len;
 	if ( endpos < destSize ) {
-		memcpy_qwc(&destBase[destStart], src, len );
+		memcpy(&destBase[destStart], src, len*16);
 		destStart += len;
 	}
 	else {
 		uint firstcopylen = destSize - destStart;
-		memcpy_qwc(&destBase[destStart], src, firstcopylen );
+		memcpy(&destBase[destStart], src, firstcopylen*16);
 		destStart = endpos % destSize;
-		memcpy_qwc(destBase, src+firstcopylen, destStart );
+		memcpy(destBase, src+firstcopylen, destStart*16);
 	}
 }

 inline void MemCopy_WrappedSrc( const u128* srcBase, uint& srcStart, uint srcSize, u128* dest, uint len ) {
 	uint endpos = srcStart + len;
 	if ( endpos < srcSize ) {
-		memcpy_qwc(dest, &srcBase[srcStart], len );
+		memcpy(dest, &srcBase[srcStart], len*16);
 		srcStart += len;
 	}
 	else {
 		uint firstcopylen = srcSize - srcStart;
-		memcpy_qwc(dest, &srcBase[srcStart], firstcopylen );
+		memcpy(dest, &srcBase[srcStart], firstcopylen*16);
 		srcStart = endpos % srcSize;
-		memcpy_qwc(dest+firstcopylen, srcBase, srcStart );
+		memcpy(dest+firstcopylen, srcBase, srcStart*16);
 	}
 }
--- a/pcsx2/Gif_Unit.h
+++ b/pcsx2/Gif_Unit.h
@ -222,7 +222,7 @@ struct Gif_Path {
 		}
 		//DevCon.WriteLn("Realign Packet [%d]", curSize - offset);
 		if (intersect) memmove(buffer, &buffer[offset], curSize - offset);
-		else       memcpy_fast(buffer, &buffer[offset], curSize - offset);
+		else       memcpy(buffer, &buffer[offset], curSize - offset);
 		curSize      -= offset;
 		curOffset     = gsPack.size;
 		gsPack.offset = 0;
@ -241,8 +241,7 @@ struct Gif_Path {
 			mtgsReadWait(); // Let MTGS run to free up buffer space
 		}
 		pxAssertDev(curSize+size<=buffSize, "Gif Path Buffer Overflow!");
-		if (aligned) memcpy_qwc (&buffer[curSize], pMem, size/16);
-		else		 memcpy_fast(&buffer[curSize], pMem, size);
+		memcpy (&buffer[curSize], pMem, size);
 		curSize     += size;
 	}

--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@ -181,7 +181,7 @@ void SysMtgsThread::OpenPlugin()
 {
 	if( m_PluginOpened ) return;

-	memcpy_aligned( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
+	memcpy( RingBuffer.Regs, PS2MEM_GS, sizeof(PS2MEM_GS) );
 	GSsetBaseMem( RingBuffer.Regs );
 	GSirqCallback( dummyIrqCallback );

@ -626,7 +626,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
 	if (syncRegs) {
 		ScopedLock lock(m_mtx_WaitGS);
 		// Completely synchronize GS and MTGS register states.
-		memcpy_fast(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
+		memcpy(RingBuffer.Regs, PS2MEM_GS, sizeof(RingBuffer.Regs));
 	}
 }

--- a/pcsx2/MTVU.cpp
+++ b/pcsx2/MTVU.cpp
@ -217,7 +217,7 @@ __fi u32 VU_Thread::Read()

 __fi void VU_Thread::Read(void* dest, u32 size)
 {
-	memcpy_fast(dest, &buffer[read_pos], size);
+	memcpy(dest, &buffer[read_pos], size);
 	incReadPos(size_u32(size));
 }

@ -240,7 +240,7 @@ __fi void VU_Thread::Write(u32 val)
 }
 __fi void VU_Thread::Write(void* src, u32 size)
 {
-	memcpy_fast(GetWritePtr(), src, size);
+	memcpy(GetWritePtr(), src, size);
 	write_offset += size_u32(size);
 }

--- a/pcsx2/PluginManager.cpp
+++ b/pcsx2/PluginManager.cpp
@ -220,7 +220,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
 		// the transfer is most likely wrapped/partial.  We need to queue it into a linear buffer
 		// and then send it on its way on the next copy.

-		memcpy_qwc( path1queue, src128, data );
+		memcpy( path1queue, src128, data*16);
 		path1size = data;
 	}
 	else
@ -235,7 +235,7 @@ static void CALLBACK GS_Legacy_gifTransfer( const u32* src, u32 data )
 			if (src128 == RingBuffer.m_Ring)
 			{
 				pxAssert( (data+path1size) <= 0x400 );
-				memcpy_qwc( &path1queue[path1size], src128, data );
+				memcpy( &path1queue[path1size], src128, data*16);
 				path1size += data;
 			}
 			GSgifTransfer1( (u32*)path1queue, 0 );
@ -455,7 +455,7 @@ static s32 CALLBACK CDVD_getBuffer2(u8* buffer)
 	u8* pb = CDVD->getBuffer();
 	if(pb == NULL) return -2;

-	memcpy_fast( buffer, pb, lastReadSize );
+	memcpy( buffer, pb, lastReadSize );
 	return 0;
 }

--- a/pcsx2/R5900OpcodeImpl.cpp
+++ b/pcsx2/R5900OpcodeImpl.cpp
@ -216,7 +216,7 @@ static int __Deci2Call(int call, u32 *addr)
 					pdeciaddr += (d2ptr[4]+0xc) % 16;

 				const int copylen = std::min<uint>(255, d2ptr[1]-0xc);
-				memcpy_fast(deci2buffer, pdeciaddr, copylen );
+				memcpy(deci2buffer, pdeciaddr, copylen );
 				deci2buffer[copylen] = '\0';

 				eeConLog( ShiftJIS_ConvertString(deci2buffer) );
--- a/pcsx2/SPR.cpp
+++ b/pcsx2/SPR.cpp
@ -97,7 +97,7 @@ int  _SPR0chain()
 			//Taking an arbitary small value for games which like to check the QWC/MADR instead of STR, so get most of
 			//the cycle delay out of the way before the end.
 			partialqwc = spr0ch.qwc;
-			memcpy_qwc(pMem, &psSu128(spr0ch.sadr), partialqwc);
+			memcpy(pMem, &psSu128(spr0ch.sadr), partialqwc*16);

 			// clear VU mem also!
 			TestClearVUs(spr0ch.madr, partialqwc, true);
@ -151,7 +151,7 @@ void _SPR0interleave()
 			case MFD_RESERVED:
 				// clear VU mem also!
 				TestClearVUs(spr0ch.madr, spr0ch.qwc, true);
-				memcpy_qwc(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc);
+				memcpy(pMem, &psSu128(spr0ch.sadr), spr0ch.qwc*16);
 				break;
 		}
 		spr0ch.sadr += spr0ch.qwc * 16;
@ -322,7 +322,7 @@ __fi static void SPR1transfer(const void* data, int qwc)
 		TestClearVUs(spr1ch.madr, spr1ch.qwc, false);
 	}

-	memcpy_qwc(&psSu128(spr1ch.sadr), data, qwc);
+	memcpy(&psSu128(spr1ch.sadr), data, qwc*16);
 	spr1ch.sadr += qwc * 16;
 }

@ -381,7 +381,7 @@ void _SPR1interleave()
 		spr1ch.qwc = std::min(tqwc, qwc);
 		qwc -= spr1ch.qwc;
 		pMem = SPRdmaGetAddr(spr1ch.madr, false);
-		memcpy_qwc(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc);
+		memcpy(&psSu128(spr1ch.sadr), pMem, spr1ch.qwc*16);
 		spr1ch.sadr += spr1ch.qwc * 16;
 		spr1ch.madr += (sqwc + spr1ch.qwc) * 16;
 	}
--- a/pcsx2/SaveState.cpp
+++ b/pcsx2/SaveState.cpp
@ -126,7 +126,7 @@ SaveStateBase& SaveStateBase::FreezeBios()
 	pxToUTF8 utf8(BiosDescription);

 	memzero( biosdesc );
-	memcpy_fast( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
+	memcpy( biosdesc, utf8, std::min( sizeof(biosdesc), utf8.Length() ) );
 	
 	Freeze( bioscheck );
 	Freeze( biosdesc );
@ -282,7 +282,7 @@ void memSavingState::FreezeMem( void* data, int size )
 	if (!size) return;

 	m_memory->MakeRoomFor( m_idx + size );
-	memcpy_fast( m_memory->GetPtr(m_idx), data, size );
+	memcpy( m_memory->GetPtr(m_idx), data, size );
 	m_idx += size;
 }

@ -322,7 +322,7 @@ void memLoadingState::FreezeMem( void* data, int size )
 {
 	const u8* const src = m_memory->GetPtr(m_idx);
 	m_idx += size;
-	memcpy_fast( data, src, size );
+	memcpy( data, src, size );
 }

 // --------------------------------------------------------------------------------------
--- a/pcsx2/Sif.h
+++ b/pcsx2/Sif.h
@ -53,8 +53,8 @@ struct sifFifo
 			const int wP0 = std::min((FIFO_SIF_W - writePos), words);
 			const int wP1 = words - wP0;

-			memcpy_fast(&data[writePos], from, wP0 << 2);
-			memcpy_fast(&data[0], &from[wP0], wP1 << 2);
+			memcpy(&data[writePos], from, wP0 << 2);
+			memcpy(&data[0], &from[wP0], wP1 << 2);

 			writePos = (writePos + words) & (FIFO_SIF_W - 1);
 			size += words;
@ -69,8 +69,8 @@ struct sifFifo
 			const int wP0 = std::min((FIFO_SIF_W - readPos), words);
 			const int wP1 = words - wP0;

-			memcpy_fast(to, &data[readPos], wP0 << 2);
-			memcpy_fast(&to[wP0], &data[0], wP1 << 2);
+			memcpy(to, &data[readPos], wP0 << 2);
+			memcpy(&to[wP0], &data[0], wP1 << 2);

 			readPos = (readPos + words) & (FIFO_SIF_W - 1);
 			size -= words;
--- a/pcsx2/Sio.cpp
+++ b/pcsx2/Sio.cpp
@ -319,7 +319,7 @@ SIO_WRITE memcardErase(u8 data)
 			{
 			case 0x82: // Erase
 				//siomode = SIO_DUMMY; // Nothing more to do here.
-				memcpy_fast(sio.buf, &header[1], 4);
+				memcpy(sio.buf, &header[1], 4);
 				sio.bufSize = 3;
 				mcd->EraseBlock();
 				break;
@ -367,7 +367,7 @@ SIO_WRITE memcardWrite(u8 data)
 			switch(data)
 			{
 			case 0x42: // Write
-				memcpy_fast(sio.buf, header, 4);
+				memcpy(sio.buf, header, 4);
 				once = true;
 				break;

@ -375,7 +375,7 @@ SIO_WRITE memcardWrite(u8 data)
 				if(once)
 				{
 					siomode = SIO_DUMMY; // Nothing more to do here.
-					memcpy_fast(sio.buf, &header[1], 4);
+					memcpy(sio.buf, &header[1], 4);
 					sio.bufSize = 3;

 					sio2.packet.recvVal1 = 0x1600; // Writing
@ -454,7 +454,7 @@ SIO_WRITE memcardRead(u8 data)
 			switch(data)
 			{
 			case 0x43: // Read
-				memcpy_fast(sio.buf, header, 4);
+				memcpy(sio.buf, header, 4);
 				once = true;
 				break;

@ -462,7 +462,7 @@ SIO_WRITE memcardRead(u8 data)
 				if(once)
 				{
 					siomode = SIO_DUMMY; // Nothing more to do here.
-					memcpy_fast(sio.buf, &header[1], 4);
+					memcpy(sio.buf, &header[1], 4);
 					sio.bufSize = 3;

 					sio2.packet.recvVal1 = 0x1700; // Reading
@ -624,7 +624,7 @@ SIO_WRITE sioWriteMemcard(u8 data)
 				cmd.mc_xor				= info.Xor;
 				cmd.Z					= mcd->term;

-				memcpy_fast(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
+				memcpy(&sio.buf[2], &cmd, sizeof(mc_command_0x26_tag));
 			}
 			break;

@ -698,7 +698,7 @@ SIO_WRITE sioWriteMemcardPSX(u8 data)
 		{
 		case 0x53: // PSX 'S'tate // haven't seen it happen yet
 			sio.buf[1] = mcd->FLAG;
-			memcpy_fast(&sio.buf[2], memcard_psx, 8);
+			memcpy(&sio.buf[2], memcard_psx, 8);
 			siomode = SIO_DUMMY;
 			break;

--- a/pcsx2/Vif_Codes.cpp
+++ b/pcsx2/Vif_Codes.cpp
@ -296,9 +296,9 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
 		if (!idx)  CpuVU0->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
 		else	   CpuVU1->Clear(addr, (idx ? 0x4000 : 0x1000) - addr);
 		
-		memcpy_fast(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
+		memcpy(VUx.Micro + addr, data, (idx ? 0x4000 : 0x1000) - addr);
 		size -= ((idx ? 0x4000 : 0x1000) - addr) / 4;
-		memcpy_fast(VUx.Micro, data, size);
+		memcpy(VUx.Micro, data, size);

 		vifX.tag.addr = size * 4;
 	}
@ -310,7 +310,7 @@ static __fi void _vifCode_MPG(int idx, u32 addr, const u32 *data, int size) {
 		// Clear VU memory before writing!
 		if (!idx)  CpuVU0->Clear(addr, size*4);
 		else	   CpuVU1->Clear(addr, size*4);
-		memcpy_fast(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast
+		memcpy(VUx.Micro + addr, data, size*4); //from tests, memcpy is 1fps faster on Grandia 3 than memcpy_fast

 		vifX.tag.addr   +=   size * 4;
 	}
--- a/pcsx2/gui/ConsoleLogger.cpp
+++ b/pcsx2/gui/ConsoleLogger.cpp
@ -584,7 +584,7 @@ bool ConsoleLogFrame::Write( ConsoleColors color, const wxString& text )

 	int endpos = m_CurQueuePos + text.Length();
 	m_QueueBuffer.MakeRoomFor( endpos + 1 );		// and the null!!
-	memcpy_fast( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
+	memcpy( &m_QueueBuffer[m_CurQueuePos], text.wc_str(), sizeof(wxChar) * text.Length() );
 	m_CurQueuePos = endpos;

 	// this NULL may be overwritten if the next message sent doesn't perform a color change.
--- a/pcsx2/x86/iVU1micro.cpp
+++ b/pcsx2/x86/iVU1micro.cpp
@ -126,17 +126,17 @@ namespace VU1micro
 #endif

 		runCount++;
-		memcpy_const((u8*)backVUregs, (u8*)&VU1,		sizeof(VURegs));
-		memcpy_const((u8*)backVUmem,	 (u8*)VU1.Mem,	0x4000);
+		memcpy((u8*)backVUregs, (u8*)&VU1,		sizeof(VURegs));
+		memcpy((u8*)backVUmem,	 (u8*)VU1.Mem,	0x4000);

 		do { // while loop needed since not always will return finished
 			SuperVUExecuteProgram(VU1.VI[ REG_TPC ].UL & 0x3fff, 1);
 		} while( VU0.VI[ REG_VPU_STAT ].UL&0x100 );

-		memcpy_const((u8*)cmpVUregs,	(u8*)&VU1,			sizeof(VURegs));
-		memcpy_const((u8*)cmpVUmem,	(u8*)VU1.Mem,		0x4000);
-		memcpy_const((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
-		memcpy_const((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);
+		memcpy((u8*)cmpVUregs,	(u8*)&VU1,			sizeof(VURegs));
+		memcpy((u8*)cmpVUmem,	(u8*)VU1.Mem,		0x4000);
+		memcpy((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
+		memcpy((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);

 		//Currently breaking mVU execution is disabled. Check mVUtestCycles<vuIndex>() in microVU_Compile.inl
 		runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);
@ -227,8 +227,8 @@ namespace VU1micro
 			if (mVUdebugNow) {

 				resetVUrec(1);
-				memcpy_const((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
-				memcpy_const((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);
+				memcpy((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
+				memcpy((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);

 				runVUrec(VU1.VI[REG_TPC].UL, 300000 /*0x7fffffff*/, 1);

--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@ -1016,8 +1016,8 @@ void SaveBranchState()
 	s_psaveInstInfo = g_pCurInstInfo;

 	// save all mmx regs
-	memcpy_const(s_saveMMXregs, mmxregs, sizeof(mmxregs));
-	memcpy_const(s_saveXMMregs, xmmregs, sizeof(xmmregs));
+	memcpy(s_saveMMXregs, mmxregs, sizeof(mmxregs));
+	memcpy(s_saveXMMregs, xmmregs, sizeof(xmmregs));
 }

 void LoadBranchState()
@ -1031,8 +1031,8 @@ void LoadBranchState()
 	g_pCurInstInfo = s_psaveInstInfo;

 	// restore all mmx regs
-	memcpy_const(mmxregs, s_saveMMXregs, sizeof(mmxregs));
-	memcpy_const(xmmregs, s_saveXMMregs, sizeof(xmmregs));
+	memcpy(mmxregs, s_saveMMXregs, sizeof(mmxregs));
+	memcpy(xmmregs, s_saveXMMregs, sizeof(xmmregs));
 }

 void iFlushCall(int flushtype)
@ -2179,7 +2179,7 @@ StartRecomp:
 			}
 		}

-		memcpy_fast(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
+		memcpy(&(*recRAMCopy)[HWADDR(startpc) / 4], PSM(startpc), pc - startpc);
 	}

 	s_pCurBlock->SetFnptr((uptr)recPtr);
--- a/pcsx2/x86/microVU.cpp
+++ b/pcsx2/x86/microVU.cpp
@ -181,8 +181,8 @@ __ri microProgram* mVUcreateProg(microVU& mVU, int startPC) {

 // Caches Micro Program
 __ri void mVUcacheProg(microVU& mVU, microProgram& prog) {
-	if (!mVU.index)	memcpy_const(prog.data, mVU.regs().Micro, 0x1000);
-	else			memcpy_const(prog.data, mVU.regs().Micro, 0x4000);
+	if (!mVU.index)	memcpy(prog.data, mVU.regs().Micro, 0x1000);
+	else			memcpy(prog.data, mVU.regs().Micro, 0x4000);
 	mVUdumpProg(mVU, prog);
 }

--- a/pcsx2/x86/microVU.h
+++ b/pcsx2/x86/microVU.h
@ -92,7 +92,7 @@ public:
 				blockEnd = blockList = newBlock;
 			}

-			memcpy_const(&newBlock->block, pBlock, sizeof(microBlock));
+			memcpy(&newBlock->block, pBlock, sizeof(microBlock));
 			thisBlock =  &newBlock->block;
 		}
 		return thisBlock;
--- a/pcsx2/x86/microVU_Branch.inl
+++ b/pcsx2/x86/microVU_Branch.inl
@ -170,7 +170,7 @@ void normBranchCompile(microVU& mVU, u32 branchPC) {
 }

 void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) {
-	memcpy_const(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+	memcpy(&mVUpBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
 	mVUsetupBranch(mVU, mFC);
 	mVUbackupRegs(mVU);

@ -386,7 +386,7 @@ void condBranch(mV, microFlagCycles& mFC, int JMPcc) {
 			s32* ajmp = xJcc32((JccComparisonType)JMPcc); 
 			u32 bPC = iPC; // mVUcompile can modify iPC, mVUpBlock, and mVUregs so back them up
 			microBlock* pBlock = mVUpBlock;
-			memcpy_const(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));
+			memcpy(&pBlock->pStateEnd, &mVUregs, sizeof(microRegInfo));

 			incPC2(1);  // Get PC for branch not-taken
 			mVUcompile(mVU, xPC, (uptr)&mVUregs);
--- a/pcsx2/x86/microVU_Compile.inl
+++ b/pcsx2/x86/microVU_Compile.inl
@ -427,10 +427,10 @@ __fi void mVUinitFirstPass(microVU& mVU, uptr pState, u8* thisPtr) {
 	mVU.p					= 0;	// All blocks start at p index #0
 	mVU.q					= 0;	// All blocks start at q index #0
 	if ((uptr)&mVUregs != pState) {	// Loads up Pipeline State Info
-		memcpy_const((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
+		memcpy((u8*)&mVUregs, (u8*)pState, sizeof(microRegInfo));
 	}
 	if (doEarlyExit(mVU) && ((uptr)&mVU.prog.lpState != pState)) {
-		memcpy_const((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
+		memcpy((u8*)&mVU.prog.lpState, (u8*)pState, sizeof(microRegInfo));
 	}
 	mVUblock.x86ptrStart	= thisPtr;
 	mVUpBlock				= mVUblocks[mVUstartPC/2]->add(&mVUblock); // Add this block to block manager
@ -530,7 +530,7 @@ void mVUDoTBit(microVU& mVU, microFlagCycles* mFC)

 void mVUSaveFlags(microVU& mVU,microFlagCycles &mFC, microFlagCycles &mFCBackup)
 {
-	memcpy_fast(&mFCBackup, &mFC, sizeof(microFlagCycles));
+	memcpy(&mFCBackup, &mFC, sizeof(microFlagCycles));
 	mVUsetFlags(mVU, mFCBackup);	   // Sets Up Flag instances
 }
 void* mVUcompile(microVU& mVU, u32 startPC, uptr pState) {
--- a/pcsx2/x86/newVif_HashBucket.h
+++ b/pcsx2/x86/newVif_HashBucket.h
@ -86,7 +86,7 @@ public:
 				wxsFormat(L"HashBucket Chain (bucket size=%d)", bucket.Size+1)
 			);
 		}
-		memcpy_const(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
+		memcpy(&bucket.Chain[bucket.Size++], &dataPtr, sizeof(T));
 	}
 	void clear() {
 		for (int i = 0; i < hSize; i++) {
--- a/pcsx2/x86/newVif_Unpack.cpp
+++ b/pcsx2/x86/newVif_Unpack.cpp
@ -117,7 +117,7 @@ _vifT int nVifUnpack(const u8* data) {

 	if (ret == vif.tag.size) { // Full Transfer
 		if (v.bSize) { // Last transfer was partial
-			memcpy_aligned(&v.buffer[v.bSize], data, size);
+			memcpy(&v.buffer[v.bSize], data, size);
 			v.bSize		+= size;
 			size        = v.bSize;
 			data		= v.buffer;
@ -140,7 +140,7 @@ _vifT int nVifUnpack(const u8* data) {
 		v.bSize			= 0;
 	}
 	else { // Partial Transfer
-		memcpy_aligned(&v.buffer[v.bSize], data, size);
+		memcpy(&v.buffer[v.bSize], data, size);
 		v.bSize		 += size;
 		vif.tag.size -= ret;

--- a/pcsx2/x86/sVU_Compare.h
+++ b/pcsx2/x86/sVU_Compare.h
@ -131,15 +131,15 @@ void recSuperVU1::Execute(u32 cycles) {
 #endif

 	runCount++;
-	memcpy_const((u8*)backVUregs,	(u8*)&VU1,		sizeof(VURegs));
-	memcpy_const((u8*)backVUmem,	(u8*) VU1.Mem,	0x4000);
+	memcpy((u8*)backVUregs,	(u8*)&VU1,		sizeof(VURegs));
+	memcpy((u8*)backVUmem,	(u8*) VU1.Mem,	0x4000);

 	runMVU1(cycles);

-	memcpy_const((u8*)cmpVUregs,(u8*)&VU1,			sizeof(VURegs));
-	memcpy_const((u8*)cmpVUmem,	(u8*)VU1.Mem,		0x4000);
-	memcpy_const((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
-	memcpy_const((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);
+	memcpy((u8*)cmpVUregs,(u8*)&VU1,			sizeof(VURegs));
+	memcpy((u8*)cmpVUmem,	(u8*)VU1.Mem,		0x4000);
+	memcpy((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
+	memcpy((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);

 	runSVU1(cycles);
 	if ((memcmp((u8*)cmpVUregs, (u8*)&VU1, (16*32) + (16*16))) || (memcmp((u8*)cmpVUmem, (u8*)VU1.Mem, 0x4000))) {
@ -230,8 +230,8 @@ void recSuperVU1::Execute(u32 cycles) {

 			resetMVU1();
 			
-			memcpy_const((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
-			memcpy_const((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);
+			memcpy((u8*)&VU1,		(u8*)backVUregs,	sizeof(VURegs));
+			memcpy((u8*)VU1.Mem,	(u8*)backVUmem,		0x4000);
 			
 			runMVU1(cycles);

--- a/pcsx2/x86/sVU_zerorec.cpp
+++ b/pcsx2/x86/sVU_zerorec.cpp
@ -898,7 +898,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
 		//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
 		r.pmem = malloc(r.size);
-		memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
+		memcpy(r.pmem, &VU->Micro[r.start], r.size);
 #endif
 		s_pFnHeader->ranges.push_back(r);
 	}
--- a/plugins/zerogs/dx/GS.h
+++ b/plugins/zerogs/dx/GS.h
@ -647,7 +647,6 @@ char *SysLibError();					// Gets previous error loading sysbols
 void SysCloseLibrary(void *lib);		// Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>
--- a/plugins/zerogs/dx/Windows/zerogs.vcxproj
+++ b/plugins/zerogs/dx/Windows/zerogs.vcxproj
@ -148,7 +148,6 @@
    <ClCompile Include="Conf.cpp" />
    <ClCompile Include="..\GSmain.cpp" />
    <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
    <ClCompile Include="..\Regs.cpp" />
    <ClCompile Include="..\targets.cpp" />
    <ClCompile Include="Win32.cpp" />
@ -199,4 +198,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters
+++ b/plugins/zerogs/dx/Windows/zerogs.vcxproj.filters
@ -24,9 +24,6 @@
    <ClCompile Include="..\Mem.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
    <ClCompile Include="..\Regs.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
@ -94,4 +91,4 @@
      <Filter>Source Files</Filter>
    </CustomBuild>
  </ItemGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj
+++ b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj
@ -154,7 +154,6 @@
    <ClCompile Include="Conf.cpp" />
    <ClCompile Include="..\GSmain.cpp" />
    <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
    <ClCompile Include="..\Regs.cpp" />
    <ClCompile Include="..\targets.cpp" />
    <ClCompile Include="Win32.cpp" />
@ -205,4 +204,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters
+++ b/plugins/zerogs/dx/Windows/zerogs_vs2012.vcxproj.filters
@ -24,9 +24,6 @@
    <ClCompile Include="..\Mem.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
    <ClCompile Include="..\Regs.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
@ -94,4 +91,4 @@
      <Filter>Source Files</Filter>
    </CustomBuild>
  </ItemGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj
+++ b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj
@ -154,7 +154,6 @@
    <ClCompile Include="Conf.cpp" />
    <ClCompile Include="..\GSmain.cpp" />
    <ClCompile Include="..\Mem.cpp" />
-    <ClCompile Include="..\memcpy_amd.cpp" />
    <ClCompile Include="..\Regs.cpp" />
    <ClCompile Include="..\targets.cpp" />
    <ClCompile Include="Win32.cpp" />
@ -205,4 +204,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters
+++ b/plugins/zerogs/dx/Windows/zerogs_vs2013.vcxproj.filters
@ -24,9 +24,6 @@
    <ClCompile Include="..\Mem.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
-    <ClCompile Include="..\memcpy_amd.cpp">
-      <Filter>Source Files</Filter>
-    </ClCompile>
    <ClCompile Include="..\Regs.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
@ -94,4 +91,4 @@
      <Filter>Source Files</Filter>
    </CustomBuild>
  </ItemGroup>
-</Project>
+</Project>
--- a/plugins/zerogs/dx/memcpy_amd.cpp
+++ b/plugins/zerogs/dx/memcpy_amd.cpp
@ -1,479 +0,0 @@
-/******************************************************************************
-
- Copyright (c) 2001 Advanced Micro Devices, Inc.
-
- LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
- EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
- NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
- PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
- DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
- BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
- INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
- OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
- OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
- NOT APPLY TO YOU.
-
- AMD does not assume any responsibility for any errors which may appear in the
- Materials nor any responsibility to support or update the Materials.  AMD retains
- the right to make changes to its test specifications at any time, without notice.
-
- NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
- further information, software, technical information, know-how, or show-how
- available to you.
-
- So that all may benefit from your experience, please report  any  problems
- or  suggestions about this software to 3dsdk.support@amd.com
-
- AMD Developer Technologies, M/S 585
- Advanced Micro Devices, Inc.
- 5900 E. Ben White Blvd.
- Austin, TX 78741
- 3dsdk.support@amd.com
-******************************************************************************/
-
-#include <assert.h>
-
-/*****************************************************************************
-MEMCPY_AMD.CPP
-******************************************************************************/
-
-// Very optimized memcpy() routine for AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
-// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
-// "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on Athlon/Duron or other recent CPU before calling!
-
-#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".
-
-#define IN_CACHE_COPY 2 * 1024  // upper limit for movq/movq copy w/SW prefetch
-// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
-// also using the "unrolled loop" optimization.   This code uses
-// the software prefetch instruction to get the data into the cache.
-
-#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
-
-#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch
-#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-
-//#include <stddef.h>
-
-// Inline assembly syntax for use with Visual C++
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-#include "PS2Etypes.h"
-
-extern "C" {
-
-#if defined(_MSC_VER) && !defined(__x86_64__)
-
-void * memcpy_amd(void *dest, const void *src, size_t n)
-{
-    __asm {
-	mov		ecx, [n]		; number of bytes to copy
-	mov		edi, [dest]		; destination
-	mov		esi, [src]		; source
-	mov		ebx, ecx		; keep a copy of count
-
-	cld
-	cmp		ecx, TINY_BLOCK_COPY
-	jb		$memcpy_ic_3	; tiny? skip mmx copy
-
-	cmp		ecx, 32*1024		; don't align between 32k-64k because
-	jbe		$memcpy_do_align	;  it appears to be slower
-	cmp		ecx, 64*1024
-	jbe		$memcpy_align_done
-$memcpy_do_align:
-	mov		ecx, 8			; a trick that's faster than rep movsb...
-	sub		ecx, edi		; align destination to qword
-	and		ecx, 111b		; get the low bits
-	sub		ebx, ecx		; update copy count
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_align_done
-	jmp		ecx				; jump to array of movsb's
-
-align 4
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-
-$memcpy_align_done:			; destination is dword aligned
-	mov		ecx, ebx		; number of bytes left to copy
-	shr		ecx, 6			; get 64-byte block count
-	jz		$memcpy_ic_2	; finish the last few bytes
-
-	cmp		ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
-	jae		$memcpy_uc_test
-
-// This is small block copy that uses the MMX registers to copy 8 bytes
-// at a time.  It uses the "unrolled loop" optimization, and also uses
-// the software prefetch instruction to get the data into the cache.
-align 16
-$memcpy_ic_1:			; 64-byte block copies, in-cache copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0, [esi+0]	; read 64 bits
-	movq	mm1, [esi+8]
-	movq	[edi+0], mm0	; write 64 bits
-	movq	[edi+8], mm1	;    note:  the normal movq writes the
-	movq	mm2, [esi+16]	;    data to cache; a cache line will be
-	movq	mm3, [esi+24]	;    allocated as needed, to store the data
-	movq	[edi+16], mm2
-	movq	[edi+24], mm3
-	movq	mm0, [esi+32]
-	movq	mm1, [esi+40]
-	movq	[edi+32], mm0
-	movq	[edi+40], mm1
-	movq	mm2, [esi+48]
-	movq	mm3, [esi+56]
-	movq	[edi+48], mm2
-	movq	[edi+56], mm3
-
-	add		esi, 64			; update source pointer
-	add		edi, 64			; update destination pointer
-	dec		ecx				; count down
-	jnz		$memcpy_ic_1	; last 64-byte block?
-
-$memcpy_ic_2:
-	mov		ecx, ebx		; has valid low 6 bits of the byte count
-$memcpy_ic_3:
-	shr		ecx, 2			; dword count
-	and		ecx, 1111b		; only look at the "remainder" bits
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_last_few
-	jmp		ecx				; jump to array of movsd's
-
-$memcpy_uc_test:
-	cmp		ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
-	jae		$memcpy_bp_1
-
-$memcpy_64_test:
-	or		ecx, ecx		; tail end of block prefetch will jump here
-	jz		$memcpy_ic_2	; no more 64-byte blocks left
-
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-align 16
-$memcpy_uc_1:				; 64-byte blocks, uncached copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0,[esi+0]		; read 64 bits
-	add		edi,64			; update destination pointer
-	movq	mm1,[esi+8]
-	add		esi,64			; update source pointer
-	movq	mm2,[esi-48]
-	movntq	[edi-64], mm0	; write 64 bits, bypassing the cache
-	movq	mm0,[esi-40]	;    note: movntq also prevents the CPU
-	movntq	[edi-56], mm1	;    from READING the destination address
-	movq	mm1,[esi-32]	;    into the cache, only to be over-written
-	movntq	[edi-48], mm2	;    so that also helps performance
-	movq	mm2,[esi-24]
-	movntq	[edi-40], mm0
-	movq	mm0,[esi-16]
-	movntq	[edi-32], mm1
-	movq	mm1,[esi-8]
-	movntq	[edi-24], mm2
-	movntq	[edi-16], mm0
-	dec		ecx
-	movntq	[edi-8], mm1
-	jnz		$memcpy_uc_1	; last 64-byte block?
-
-	jmp		$memcpy_ic_2		; almost done
-
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-$memcpy_bp_1:			; large blocks, block prefetch copy
-
-	cmp		ecx, CACHEBLOCK			; big enough to run another prefetch loop?
-	jl		$memcpy_64_test			; no, back to regular uncached copy
-
-	mov		eax, CACHEBLOCK / 2		; block prefetch loop, unrolled 2X
-	add		esi, CACHEBLOCK * 64	; move to the top of the block
-align 16
-$memcpy_bp_2:
-	mov		edx, [esi-64]		; grab one address per cache line
-	mov		edx, [esi-128]		; grab one address per cache line
-	sub		esi, 128			; go reverse order to suppress HW prefetcher
-	dec		eax					; count down the cache lines
-	jnz		$memcpy_bp_2		; keep grabbing more lines into cache
-
-	mov		eax, CACHEBLOCK		; now that it's in cache, do the copy
-align 16
-$memcpy_bp_3:
-	movq	mm0, [esi   ]		; read 64 bits
-	movq	mm1, [esi+ 8]
-	movq	mm2, [esi+16]
-	movq	mm3, [esi+24]
-	movq	mm4, [esi+32]
-	movq	mm5, [esi+40]
-	movq	mm6, [esi+48]
-	movq	mm7, [esi+56]
-	add		esi, 64				; update source pointer
-	movntq	[edi   ], mm0		; write 64 bits, bypassing cache
-	movntq	[edi+ 8], mm1		;    note: movntq also prevents the CPU
-	movntq	[edi+16], mm2		;    from READING the destination address
-	movntq	[edi+24], mm3		;    into the cache, only to be over-written,
-	movntq	[edi+32], mm4		;    so that also helps performance
-	movntq	[edi+40], mm5
-	movntq	[edi+48], mm6
-	movntq	[edi+56], mm7
-	add		edi, 64				; update dest pointer
-
-	dec		eax					; count down
-
-	jnz		$memcpy_bp_3		; keep copying
-	sub		ecx, CACHEBLOCK		; update the 64-byte block count
-	jmp		$memcpy_bp_1		; keep processing chunks
-
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".   Then it handles the last few bytes.
-align 4
-	movsd
-	movsd			; perform last 1-15 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd			; perform last 1-7 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-
-$memcpy_last_few:		; dword aligned from before movsd's
-	mov		ecx, ebx	; has valid low 2 bits of the byte count
-	and		ecx, 11b	; the last few cows must come home
-	jz		$memcpy_final	; no more, let's leave
-	rep		movsb		; the last 1, 2, or 3 bytes
-
-$memcpy_final:
-	emms				; clean up the MMX state
-	sfence				; flush the write buffer
-	mov		eax, [dest]	; ret value = destination pointer
-
-    }
-}
-
-// mmx memcpy implementation, size has to be a multiple of 8
-// returns 0 is equal, nonzero value if not equal
-// ~10 times faster than standard memcmp
-// (zerofrog)
-u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-{
-	assert( (cmpsize&7) == 0 );
-
-	__asm {
-push esi
-		mov ecx, cmpsize
-		mov edx, src1
-		mov esi, src2
-
-		cmp ecx, 32
-		jl Done4
-
-		// custom test first 8 to make sure things are ok
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		movq mm2, [esi+16]
-		pmovmskb eax, mm0
-		movq mm3, [esi+24]
-
-		// check if eq
-		cmp eax, 0xff
-		je NextComp
-		mov eax, 1
-		jmp End
-
-NextComp:
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm2, mm3
-		pmovmskb eax, mm2
-
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je ContinueTest
-		mov eax, 1
-		jmp End
-
-		cmp ecx, 64
-		jl Done8
-
-Cmp8:
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		movq mm4, [esi+32]
-		movq mm5, [esi+40]
-		movq mm6, [esi+48]
-		movq mm7, [esi+56]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pcmpeqd mm4, [edx+32]
-		pand mm0, mm2
-		pcmpeqd mm5, [edx+40]
-		pand mm0, mm3
-		pcmpeqd mm6, [edx+48]
-		pand mm0, mm4
-		pcmpeqd mm7, [edx+56]
-		pand mm0, mm5
-		pand mm0, mm6
-		pand mm0, mm7
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		je Continue
-		mov eax, 1
-		jmp End
-
-Continue:
-		sub ecx, 64
-		add esi, 64
-		add edx, 64
-ContinueTest:
-		cmp ecx, 64
-		jge Cmp8
-
-Done8:
-		test ecx, 0x20
-		jz Done4
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pand mm0, mm2
-		pand mm0, mm3
-		pmovmskb eax, mm0
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je Done4
-		mov eax, 1
-		jmp End
-
-Done4:
-		cmp ecx, 24
-		jne Done2
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pand mm0, mm1
-		pand mm0, mm2
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done2:
-		cmp ecx, 16
-		jne Done1
-
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done1:
-		cmp ecx, 8
-		jne Done
-
-		mov eax, [esi]
-		mov esi, [esi+4]
-		cmp eax, [edx]
-		je Next
-		mov eax, 1
-		jmp End
-
-Next:
-		cmp esi, [edx+4]
-		setne al
-		jmp End
-
-Done:
-		xor eax, eax
-
-End:
-		pop esi
-		emms
-	}
-}
-
-#else // _MSC_VER
-// assume gcc or mingw or win x64
-
-#include <memory.h>
-#include <string.h>
-
-void * memcpy_amd(void *dest, const void *src, size_t n)
-{
-memcpy(dest, src, n);
-return dest;
-}
-
-
-#endif
-
-}
--- a/plugins/zerogs/dx/targets.cpp
+++ b/plugins/zerogs/dx/targets.cpp
@ -2026,7 +2026,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 		targ->clut.resize(clutsize);

 		if( tex0.cpsm <= 1 ) { // 32 bit
-			memcpy_amd(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+			memcpy(&targ->clut[0], ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
 		}
 		else {
 			u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
@ -2110,7 +2110,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 			targ->memory->ref = 1;
 		}

-		memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+		memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

 		u8* psrc = (u8*)(ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);

@ -2136,7 +2136,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 				targ->memory->ref = 1;
 			}

-			memcpy_amd(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+			memcpy(targ->memory->ptr, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);

 			// needs to be 8 bit, use xmm for unpacking
 			u16* dst = (u16*)lock.pBits;
@ -2219,7 +2219,7 @@ Z16Loop:
 				targ->memory = NULL;
 			}

-			memcpy_amd(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
+			memcpy(lock.pBits, ZeroGS::g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height );
 		}
 	}

--- a/plugins/zerogs/dx/zerogs.cpp
+++ b/plugins/zerogs/dx/zerogs.cpp
@ -2239,7 +2239,7 @@ void ZeroGS::Flush(int context)
 					}

 					if( curvb.tex0.cpsm <= 1 ) { // 32 bit
-						memcpy_amd(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
+						memcpy(lock.pBits, ZeroGS::g_pbyGSClut+nClutOffset, clutsize);
 					}
 					else {
 						u16* pClutBuffer = (u16*)(ZeroGS::g_pbyGSClut + nClutOffset);
@ -5087,7 +5087,7 @@ void ZeroGS::CaptureFrame()

 	BYTE* pend = (BYTE*)lock.pBits + (conf.height-1)*width*4;
 	for(int i = 0; i < conf.height; ++i) {
-		memcpy_amd(&mem[width*4*i], pend - width*4*i, width * 4);
+		memcpy(&mem[width*4*i], pend - width*4*i, width * 4);
 	}
 	s_ptexAVICapture->UnlockRect();

--- a/plugins/zerogs/opengl/CMakeLists.txt
+++ b/plugins/zerogs/opengl/CMakeLists.txt
@ -36,7 +36,6 @@ set(zerogsSources
 	GSmain.cpp
 	GLWinX11.cpp
 	Mem.cpp
-	memcpy_amd.cpp
 	rasterfont.cpp
 	Regs.cpp
 	targets.cpp
--- a/plugins/zerogs/opengl/GS.h
+++ b/plugins/zerogs/opengl/GS.h
@ -728,7 +728,6 @@ char *SysLibError();					// Gets previous error loading sysbols
 void SysCloseLibrary(void *lib);		// Closes Library
 void SysMessage(char *fmt, ...);

-extern "C" void * memcpy_amd(void *dest, const void *src, size_t n);
 extern "C" u8 memcmp_mmx(const void *dest, const void *src, int n);

 template <typename T>
--- a/plugins/zerogs/opengl/Makefile.am
+++ b/plugins/zerogs/opengl/Makefile.am
@ -23,7 +23,7 @@ libZeroGSogl_LDFLAGS+=-Wl,-soname,@ZEROGS_SONAME@
 libZeroGSogl_LDADD=$(libZeroGSogl_a_OBJECTS)

 libZeroGSogl_a_SOURCES = \
-GSmain.cpp  memcpy_amd.cpp  Regs.cpp     x86.cpp     zpipe.cpp \
+GSmain.cpp  Regs.cpp         x86.cpp     zpipe.cpp \
 Mem.cpp     rasterfont.cpp  targets.cpp  zerogs.cpp GifTransfer.cpp GLWinX11.cpp

 libZeroGSogl_a_SOURCES += x86-32.S
--- a/plugins/zerogs/opengl/memcpy_amd.cpp
+++ b/plugins/zerogs/opengl/memcpy_amd.cpp
@ -1,478 +0,0 @@
-/******************************************************************************
-
- Copyright (c) 2001 Advanced Micro Devices, Inc.
-
- LIMITATION OF LIABILITY:  THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
- EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
- NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
- PARTICULAR PURPOSE.  IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
- DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
- BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
- INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
- OF SUCH DAMAGES.  BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
- OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
- NOT APPLY TO YOU.
-
- AMD does not assume any responsibility for any errors which may appear in the
- Materials nor any responsibility to support or update the Materials.  AMD retains
- the right to make changes to its test specifications at any time, without notice.
-
- NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
- further information, software, technical information, know-how, or show-how
- available to you.
-
- So that all may benefit from your experience, please report  any  problems
- or  suggestions about this software to 3dsdk.support@amd.com
-
- AMD Developer Technologies, M/S 585
- Advanced Micro Devices, Inc.
- 5900 E. Ben White Blvd.
- Austin, TX 78741
- 3dsdk.support@amd.com
-******************************************************************************/
-
-#include <assert.h>
-
-/*****************************************************************************
-MEMCPY_AMD.CPP
-******************************************************************************/
-
-// Very optimized memcpy() routine for AMD Athlon and Duron family.
-// This code uses any of FOUR different basic copy methods, depending
-// on the transfer size.
-// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
-// "Streaming Store"), and also uses the software prefetch instructions,
-// be sure you're running on Athlon/Duron or other recent CPU before calling!
-
-#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".
-
-#define IN_CACHE_COPY 2 * 1024  // upper limit for movq/movq copy w/SW prefetch
-// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
-// also using the "unrolled loop" optimization.   This code uses
-// the software prefetch instruction to get the data into the cache.
-
-#define UNCACHED_COPY 4 * 1024 // upper limit for movq/movntq w/SW prefetch
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
-
-#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch
-#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-
-//#include <stddef.h>
-
-// Inline assembly syntax for use with Visual C++
-#ifdef _WIN32
-#include <windows.h>
-#endif
-
-extern "C" {
-#include "PS2Etypes.h"
-
-#if defined(_MSC_VER)
-
-void * memcpy_amd(void *dest, const void *src, size_t n)
-{
-    __asm {
-	mov		ecx, [n]		; number of bytes to copy
-	mov		edi, [dest]		; destination
-	mov		esi, [src]		; source
-	mov		ebx, ecx		; keep a copy of count
-
-	cld
-	cmp		ecx, TINY_BLOCK_COPY
-	jb		$memcpy_ic_3	; tiny? skip mmx copy
-
-	cmp		ecx, 32*1024		; don't align between 32k-64k because
-	jbe		$memcpy_do_align	;  it appears to be slower
-	cmp		ecx, 64*1024
-	jbe		$memcpy_align_done
-$memcpy_do_align:
-	mov		ecx, 8			; a trick that's faster than rep movsb...
-	sub		ecx, edi		; align destination to qword
-	and		ecx, 111b		; get the low bits
-	sub		ebx, ecx		; update copy count
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_align_done
-	jmp		ecx				; jump to array of movsb's
-
-align 4
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-	movsb
-
-$memcpy_align_done:			; destination is dword aligned
-	mov		ecx, ebx		; number of bytes left to copy
-	shr		ecx, 6			; get 64-byte block count
-	jz		$memcpy_ic_2	; finish the last few bytes
-
-	cmp		ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
-	jae		$memcpy_uc_test
-
-// This is small block copy that uses the MMX registers to copy 8 bytes
-// at a time.  It uses the "unrolled loop" optimization, and also uses
-// the software prefetch instruction to get the data into the cache.
-align 16
-$memcpy_ic_1:			; 64-byte block copies, in-cache copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0, [esi+0]	; read 64 bits
-	movq	mm1, [esi+8]
-	movq	[edi+0], mm0	; write 64 bits
-	movq	[edi+8], mm1	;    note:  the normal movq writes the
-	movq	mm2, [esi+16]	;    data to cache; a cache line will be
-	movq	mm3, [esi+24]	;    allocated as needed, to store the data
-	movq	[edi+16], mm2
-	movq	[edi+24], mm3
-	movq	mm0, [esi+32]
-	movq	mm1, [esi+40]
-	movq	[edi+32], mm0
-	movq	[edi+40], mm1
-	movq	mm2, [esi+48]
-	movq	mm3, [esi+56]
-	movq	[edi+48], mm2
-	movq	[edi+56], mm3
-
-	add		esi, 64			; update source pointer
-	add		edi, 64			; update destination pointer
-	dec		ecx				; count down
-	jnz		$memcpy_ic_1	; last 64-byte block?
-
-$memcpy_ic_2:
-	mov		ecx, ebx		; has valid low 6 bits of the byte count
-$memcpy_ic_3:
-	shr		ecx, 2			; dword count
-	and		ecx, 1111b		; only look at the "remainder" bits
-	neg		ecx				; set up to jump into the array
-	add		ecx, offset $memcpy_last_few
-	jmp		ecx				; jump to array of movsd's
-
-$memcpy_uc_test:
-	cmp		ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
-	jae		$memcpy_bp_1
-
-$memcpy_64_test:
-	or		ecx, ecx		; tail end of block prefetch will jump here
-	jz		$memcpy_ic_2	; no more 64-byte blocks left
-
-// For larger blocks, which will spill beyond the cache, it's faster to
-// use the Streaming Store instruction MOVNTQ.   This write instruction
-// bypasses the cache and writes straight to main memory.  This code also
-// uses the software prefetch instruction to pre-read the data.
-align 16
-$memcpy_uc_1:				; 64-byte blocks, uncached copy
-
-	prefetchnta [esi + (200*64/34+192)]		; start reading ahead
-
-	movq	mm0,[esi+0]		; read 64 bits
-	add		edi,64			; update destination pointer
-	movq	mm1,[esi+8]
-	add		esi,64			; update source pointer
-	movq	mm2,[esi-48]
-	movntq	[edi-64], mm0	; write 64 bits, bypassing the cache
-	movq	mm0,[esi-40]	;    note: movntq also prevents the CPU
-	movntq	[edi-56], mm1	;    from READING the destination address
-	movq	mm1,[esi-32]	;    into the cache, only to be over-written
-	movntq	[edi-48], mm2	;    so that also helps performance
-	movq	mm2,[esi-24]
-	movntq	[edi-40], mm0
-	movq	mm0,[esi-16]
-	movntq	[edi-32], mm1
-	movq	mm1,[esi-8]
-	movntq	[edi-24], mm2
-	movntq	[edi-16], mm0
-	dec		ecx
-	movntq	[edi-8], mm1
-	jnz		$memcpy_uc_1	; last 64-byte block?
-
-	jmp		$memcpy_ic_2		; almost done
-
-// For the largest size blocks, a special technique called Block Prefetch
-// can be used to accelerate the read operations.   Block Prefetch reads
-// one address per cache line, for a series of cache lines, in a short loop.
-// This is faster than using software prefetch.  The technique is great for
-// getting maximum read bandwidth, especially in DDR memory systems.
-$memcpy_bp_1:			; large blocks, block prefetch copy
-
-	cmp		ecx, CACHEBLOCK			; big enough to run another prefetch loop?
-	jl		$memcpy_64_test			; no, back to regular uncached copy
-
-	mov		eax, CACHEBLOCK / 2		; block prefetch loop, unrolled 2X
-	add		esi, CACHEBLOCK * 64	; move to the top of the block
-align 16
-$memcpy_bp_2:
-	mov		edx, [esi-64]		; grab one address per cache line
-	mov		edx, [esi-128]		; grab one address per cache line
-	sub		esi, 128			; go reverse order to suppress HW prefetcher
-	dec		eax					; count down the cache lines
-	jnz		$memcpy_bp_2		; keep grabbing more lines into cache
-
-	mov		eax, CACHEBLOCK		; now that it's in cache, do the copy
-align 16
-$memcpy_bp_3:
-	movq	mm0, [esi   ]		; read 64 bits
-	movq	mm1, [esi+ 8]
-	movq	mm2, [esi+16]
-	movq	mm3, [esi+24]
-	movq	mm4, [esi+32]
-	movq	mm5, [esi+40]
-	movq	mm6, [esi+48]
-	movq	mm7, [esi+56]
-	add		esi, 64				; update source pointer
-	movntq	[edi   ], mm0		; write 64 bits, bypassing cache
-	movntq	[edi+ 8], mm1		;    note: movntq also prevents the CPU
-	movntq	[edi+16], mm2		;    from READING the destination address
-	movntq	[edi+24], mm3		;    into the cache, only to be over-written,
-	movntq	[edi+32], mm4		;    so that also helps performance
-	movntq	[edi+40], mm5
-	movntq	[edi+48], mm6
-	movntq	[edi+56], mm7
-	add		edi, 64				; update dest pointer
-
-	dec		eax					; count down
-
-	jnz		$memcpy_bp_3		; keep copying
-	sub		ecx, CACHEBLOCK		; update the 64-byte block count
-	jmp		$memcpy_bp_1		; keep processing chunks
-
-// The smallest copy uses the X86 "movsd" instruction, in an optimized
-// form which is an "unrolled loop".   Then it handles the last few bytes.
-align 4
-	movsd
-	movsd			; perform last 1-15 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd			; perform last 1-7 dword copies
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-	movsd
-
-$memcpy_last_few:		; dword aligned from before movsd's
-	mov		ecx, ebx	; has valid low 2 bits of the byte count
-	and		ecx, 11b	; the last few cows must come home
-	jz		$memcpy_final	; no more, let's leave
-	rep		movsb		; the last 1, 2, or 3 bytes
-
-$memcpy_final:
-	emms				; clean up the MMX state
-	sfence				; flush the write buffer
-	mov		eax, [dest]	; ret value = destination pointer
-
-    }
-}
-
-// mmx memcpy implementation, size has to be a multiple of 8
-// returns 0 is equal, nonzero value if not equal
-// ~10 times faster than standard memcmp
-// (zerofrog)
-u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)
-{
-	assert( (cmpsize&7) == 0 );
-
-	__asm {
-push esi
-		mov ecx, cmpsize
-		mov edx, src1
-		mov esi, src2
-
-		cmp ecx, 32
-		jl Done4
-
-		// custom test first 8 to make sure things are ok
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		movq mm2, [esi+16]
-		pmovmskb eax, mm0
-		movq mm3, [esi+24]
-
-		// check if eq
-		cmp eax, 0xff
-		je NextComp
-		mov eax, 1
-		jmp End
-
-NextComp:
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm2, mm3
-		pmovmskb eax, mm2
-
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je ContinueTest
-		mov eax, 1
-		jmp End
-
-		cmp ecx, 64
-		jl Done8
-
-Cmp8:
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		movq mm4, [esi+32]
-		movq mm5, [esi+40]
-		movq mm6, [esi+48]
-		movq mm7, [esi+56]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pcmpeqd mm4, [edx+32]
-		pand mm0, mm2
-		pcmpeqd mm5, [edx+40]
-		pand mm0, mm3
-		pcmpeqd mm6, [edx+48]
-		pand mm0, mm4
-		pcmpeqd mm7, [edx+56]
-		pand mm0, mm5
-		pand mm0, mm6
-		pand mm0, mm7
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		je Continue
-		mov eax, 1
-		jmp End
-
-Continue:
-		sub ecx, 64
-		add esi, 64
-		add edx, 64
-ContinueTest:
-		cmp ecx, 64
-		jge Cmp8
-
-Done8:
-		test ecx, 0x20
-		jz Done4
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		movq mm3, [esi+24]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pcmpeqd mm3, [edx+24]
-		pand mm0, mm1
-		pand mm0, mm2
-		pand mm0, mm3
-		pmovmskb eax, mm0
-		sub ecx, 32
-		add esi, 32
-		add edx, 32
-
-		// check if eq
-		cmp eax, 0xff
-		je Done4
-		mov eax, 1
-		jmp End
-
-Done4:
-		cmp ecx, 24
-		jne Done2
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		movq mm2, [esi+16]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pcmpeqd mm2, [edx+16]
-		pand mm0, mm1
-		pand mm0, mm2
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done2:
-		cmp ecx, 16
-		jne Done1
-
-		movq mm0, [esi]
-		movq mm1, [esi+8]
-		pcmpeqd mm0, [edx]
-		pcmpeqd mm1, [edx+8]
-		pand mm0, mm1
-		pmovmskb eax, mm0
-
-		// check if eq
-		cmp eax, 0xff
-		setne al
-		jmp End
-
-Done1:
-		cmp ecx, 8
-		jne Done
-
-		mov eax, [esi]
-		mov esi, [esi+4]
-		cmp eax, [edx]
-		je Next
-		mov eax, 1
-		jmp End
-
-Next:
-		cmp esi, [edx+4]
-		setne al
-		jmp End
-
-Done:
-		xor eax, eax
-
-End:
-		pop esi
-		emms
-	}
-}
-
-#else // _MSC_VER
-// assume gcc
-
-#include <memory.h>
-#include <string.h>
-
-void * memcpy_amd(void *dest, const void *src, size_t n)
-{
-memcpy(dest, src, n);
-return dest;
-}
-
-
-#endif
-
-}
--- a/plugins/zerogs/opengl/targets.cpp
+++ b/plugins/zerogs/opengl/targets.cpp
@ -1789,7 +1789,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 		targ->clut.resize(clutsize);

 		if( tex0.cpsm <= 1 ) { // 32 bit
-			memcpy_amd(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
+			memcpy(&targ->clut[0], g_pbyGSClut+nClutOffset, clutsize);
 		}
 		else {
 			u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
@ -1854,7 +1854,7 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 		assert(targ->ptex->ref > 0 );
 	}

-	memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
+	memcpy(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
 	vector<u8> texdata;
 	u8* ptexdata = NULL;

--- a/plugins/zerogs/opengl/zerogs.cpp
+++ b/plugins/zerogs/opengl/zerogs.cpp
@ -2568,7 +2568,7 @@ void ZeroGS::Flush(int context)
 	g_nCurVBOIndex = (g_nCurVBOIndex+1)%g_vboBuffers.size();
 	glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
 //	void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-//	memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+//	memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 //	glUnmapBuffer(GL_ARRAY_BUFFER);
 	SET_STREAM();

@ -2652,7 +2652,7 @@ void ZeroGS::Flush(int context)
 					}

 					if( curvb.tex0.cpsm <= 1 ) { // 32 bit
-						memcpy_amd(&data[0], g_pbyGSClut+nClutOffset, clutsize);
+						memcpy(&data[0], g_pbyGSClut+nClutOffset, clutsize);
 					}
 					else {
 						u16* pClutBuffer = (u16*)(g_pbyGSClut + nClutOffset);
@ -5779,7 +5779,7 @@ void ZeroGS::CaptureFrame()

 //  u8* pend = (u8*)&data[0] + (nBackbufferHeight-1)*nBackbufferWidth*4;
 //  for(int i = 0; i < conf.height; ++i) {
-//	  memcpy_amd(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
+//	  memcpy(&mem[nBackbufferWidth*4*i], pend - nBackbufferWidth*4*i, nBackbufferWidth * 4);
 //  }

 	int fps = SMODE1->CMOD == 3 ? 50 : 60;
--- a/plugins/zerogs/opengl/zerogs.h
+++ b/plugins/zerogs/opengl/zerogs.h
@ -436,7 +436,7 @@ namespace ZeroGS {
 			if( nCount + nVerts > nNumVertices ) {
 				// recreate except with a bigger count
 				VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU)*nNumVertices*2, 256);
-				memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+				memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 				nNumVertices *= 2;
 				assert( nCount + nVerts <= nNumVertices );
 				_aligned_free(pBufferData);
--- a/plugins/zzogl-pg-cg/opengl/CMakeLists.txt
+++ b/plugins/zzogl-pg-cg/opengl/CMakeLists.txt
@ -55,7 +55,6 @@ set(zzoglSources
    GSmain.cpp
    HostMemory.cpp
    Mem.cpp
-    # memcpy_amd.cpp
    Mem_Swizzle.cpp
    Mem_Tables.cpp
    Profile.cpp
--- a/plugins/zzogl-pg-cg/opengl/Util.h
+++ b/plugins/zzogl-pg-cg/opengl/Util.h
@ -68,7 +68,6 @@ extern "C" char* CALLBACK PS2EgetLibName(void);
 #include "GSDump.h"

 #include "Utilities/MemcpyFast.h"
-#define memcpy_amd memcpy_fast

 extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

--- a/plugins/zzogl-pg-cg/opengl/ZZClut.cpp
+++ b/plugins/zzogl-pg-cg/opengl/ZZClut.cpp
@ -493,7 +493,7 @@ template <>
 /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
 {
    u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
-    memcpy_amd((u8*)dst, clut, clutsize);
+    memcpy((u8*)dst, clut, clutsize);
 }

 template <>
--- a/plugins/zzogl-pg-cg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg-cg/opengl/ZZoglFlush.cpp
@ -657,7 +657,7 @@ inline void FlushSetStream(VB& curvb)
 	g_nCurVBOIndex = (g_nCurVBOIndex + 1) % g_vboBuffers.size();
 	glBufferData(GL_ARRAY_BUFFER, curvb.nCount * sizeof(VertexGPU), curvb.pBufferData, GL_STREAM_DRAW);
 //	void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-//	memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+//	memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 //	glUnmapBuffer(GL_ARRAY_BUFFER);
 	SET_STREAM();
 	
--- a/plugins/zzogl-pg-cg/opengl/ZZoglVB.h
+++ b/plugins/zzogl-pg-cg/opengl/ZZoglVB.h
@ -89,7 +89,7 @@ class VB
 			assert(pBufferData != NULL);
 			nNumVertices *= 2;
 			VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
-			memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+			memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 			assert(nCount <= nNumVertices);
 			_aligned_free(pBufferData);
 			pBufferData = ptemp;
--- a/plugins/zzogl-pg-cg/opengl/targets.cpp
+++ b/plugins/zzogl-pg-cg/opengl/targets.cpp
@ -1979,7 +1979,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 		assert(targ->ptex->ref > 0);
 	}

-	memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
+	memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));

 	__aligned16 u8* ptexdata = NULL;
 	bool has_data = false;
--- a/plugins/zzogl-pg/opengl/Util.h
+++ b/plugins/zzogl-pg/opengl/Util.h
@ -86,7 +86,6 @@ typedef signed long long int64;
 #include "GSDump.h"

 #include "Utilities/MemcpyFast.h"
-#define memcpy_amd memcpy_fast

 extern wxString s_strIniPath; // Air's new (r2361) new constant for ini file path

--- a/plugins/zzogl-pg/opengl/ZZClut.cpp
+++ b/plugins/zzogl-pg/opengl/ZZClut.cpp
@ -489,7 +489,7 @@ template <>
 /*__forceinline*/ void ClutBuffer_to_Array<u32>(u32* dst, u32 csa, u32 clutsize)
 {
    u8* clut = (u8*)GetClutBufferAddress<u32>(csa);
-    memcpy_amd((u8*)dst, clut, clutsize);
+    memcpy((u8*)dst, clut, clutsize);
 }

 template <>
--- a/plugins/zzogl-pg/opengl/ZZMemoryTargets.cpp
+++ b/plugins/zzogl-pg/opengl/ZZMemoryTargets.cpp
@ -364,7 +364,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 		assert(targ->ptex->ref > 0);
 	}

-	memcpy_amd(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));
+	memcpy(targ->ptex->memptr, MemoryAddress(targ->realy), MemorySize(targ->height));

 	__aligned16 u8* ptexdata = NULL;
 	bool has_data = false;
--- a/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
+++ b/plugins/zzogl-pg/opengl/ZZoglFlush.cpp
@ -535,7 +535,7 @@ inline void FlushSetStream(VB& curvb)


 //	void* pdata = glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-//	memcpy_amd(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
+//	memcpy(pdata, curvb.pBufferData, curvb.nCount * sizeof(VertexGPU));
 //	glUnmapBuffer(GL_ARRAY_BUFFER);
 	SET_STREAM();
 	
--- a/plugins/zzogl-pg/opengl/ZZoglVB.h
+++ b/plugins/zzogl-pg/opengl/ZZoglVB.h
@ -89,7 +89,7 @@ class VB
 			assert(pBufferData != NULL);
 			nNumVertices *= 2;
 			VertexGPU* ptemp = (VertexGPU*)_aligned_malloc(sizeof(VertexGPU) * nNumVertices, 256);
-			memcpy_amd(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
+			memcpy(ptemp, pBufferData, sizeof(VertexGPU) * nCount);
 			assert(nCount <= nNumVertices);
 			_aligned_free(pBufferData);
 			pBufferData = ptemp;