diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp
index fccbd2770e..618ec40b18 100644
--- a/Source/Core/Common/x64Emitter.cpp
+++ b/Source/Core/Common/x64Emitter.cpp
@@ -870,35 +870,49 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 	src.WriteRest(this);
 }
 
-void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+void XEmitter::WriteMOVBE(int bits, u8 op, X64Reg reg, OpArg arg)
 {
 	_assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
 	if (bits == 8)
 	{
-		MOV(bits, dest, src);
+		MOV(8, op & 1 ? arg : R(reg), op & 1 ? R(reg) : arg);
 		return;
 	}
 
-	if (bits == 16) Write8(0x66);
+	_assert_msg_(DYNA_REC, !arg.IsSimpleReg() && !arg.IsImm(), "MOVBE: need r<-m or m<-r!");
+	arg.WriteRex(this, bits, bits, reg);
+	Write8(0x0F);
+	Write8(0x38);
+	Write8(op);
+	arg.WriteRest(this, 0, reg);
+}
+void XEmitter::MOVBE(int bits, X64Reg dest, const OpArg& src) {WriteMOVBE(bits, 0xF0, dest, src);}
+void XEmitter::MOVBE(int bits, const OpArg& dest, X64Reg src) {WriteMOVBE(bits, 0xF1, src, dest);}
 
-	if (dest.IsSimpleReg())
+void XEmitter::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
+{
+	if (cpu_info.bMOVBE)
 	{
-		_assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
-		src.WriteRex(this, bits, bits, dest.GetSimpleReg());
-		Write8(0x0F); Write8(0x38); Write8(0xF0);
-		src.WriteRest(this, 0, dest.GetSimpleReg());
-	}
-	else if (src.IsSimpleReg())
-	{
-		_assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
-		dest.WriteRex(this, bits, bits, src.GetSimpleReg());
-		Write8(0x0F); Write8(0x38); Write8(0xF1);
-		dest.WriteRest(this, 0, src.GetSimpleReg());
+		MOVBE(size, dst, src);
 	}
 	else
 	{
-		_assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+		MOV(size, R(dst), src);
+		BSWAP(size, dst);
+	}
+}
+
+void XEmitter::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
+{
+	if (cpu_info.bMOVBE)
+	{
+		MOVBE(size, dst, src);
+	}
+	else
+	{
+		BSWAP(size, src);
+		MOV(size, dst, R(src));
 	}
 }
diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h
index 4443309004..48a1eae3b1 100644
--- a/Source/Core/Common/x64Emitter.h
+++ b/Source/Core/Common/x64Emitter.h
@@ -299,6 +299,7 @@ private:
 	void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
 	void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
 	void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteMOVBE(int bits, u8 op, X64Reg regOp, OpArg arg);
 	void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
@@ -476,7 +477,10 @@ public:
 	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
 	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
-	void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+	void MOVBE(int bits, X64Reg dest, const OpArg& src);
+	void MOVBE(int bits, const OpArg& dest, X64Reg src);
+	void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
+	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
 
 	// Available only on AMD >= Phenom or Intel >= Haswell
 	void LZCNT(int bits, X64Reg dest, OpArg src);
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
index 1209e2bd46..b8f2809fec 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.cpp
@@ -23,32 +23,6 @@ void EmuCodeBlock::MemoryExceptionCheck()
 	}
 }
 
-void EmuCodeBlock::LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src)
-{
-	if (cpu_info.bMOVBE)
-	{
-		MOVBE(size, R(dst), src);
-	}
-	else
-	{
-		MOV(size, R(dst), src);
-		BSWAP(size, dst);
-	}
-}
-
-void EmuCodeBlock::SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src)
-{
-	if (cpu_info.bMOVBE)
-	{
-		MOVBE(size, dst, R(src));
-	}
-	else
-	{
-		BSWAP(size, src);
-		MOV(size, dst, R(src));
-	}
-}
-
 void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
 	MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
@@ -451,7 +425,7 @@ u8 *EmuCodeBlock::UnsafeWriteRegToReg(OpArg reg_value, X64Reg reg_addr, int acce
 {
 	if (cpu_info.bMOVBE)
 	{
-		MOVBE(accessSize, dest, reg_value);
+		MOVBE(accessSize, dest, reg_value.GetSimpleReg());
 	}
 	else
 	{
diff --git a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
index 9c11937b47..c3175633ba 100644
--- a/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
+++ b/Source/Core/Core/PowerPC/JitCommon/Jit_Util.h
@@ -68,9 +68,6 @@ public:
 		SetCodePtr(nearcode);
 	}
 
-	void LoadAndSwap(int size, Gen::X64Reg dst, const Gen::OpArg& src);
-	void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
-
 	Gen::FixupBranch CheckIfSafeAddress(Gen::OpArg reg_value, Gen::X64Reg reg_addr, BitSet32 registers_in_use, u32 mem_mask);
 	void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
 	void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
diff --git a/Source/Core/VideoBackends/D3D/VertexManager.cpp b/Source/Core/VideoBackends/D3D/VertexManager.cpp
index 5a15ca2a0b..5bd39d45b2 100644
--- a/Source/Core/VideoBackends/D3D/VertexManager.cpp
+++ b/Source/Core/VideoBackends/D3D/VertexManager.cpp
@@ -117,7 +117,6 @@ void VertexManager::PrepareDrawBuffers(u32 stride)
 
 void VertexManager::Draw(u32 stride)
 {
-	u32 components = VertexLoaderManager::GetCurrentVertexFormat()->m_components;
 	u32 indices = IndexGenerator::GetIndexLen();
 
 	D3D::stateman->SetVertexBuffer(m_buffers[m_currentBuffer], stride, 0);
diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt
index 2101a719c2..ceb83564f1 100644
--- a/Source/Core/VideoCommon/CMakeLists.txt
+++ b/Source/Core/VideoCommon/CMakeLists.txt
@@ -45,7 +45,7 @@ set(SRCS BoundingBox.cpp
 set(LIBS core png)
 
 if(_M_X86)
-	set(SRCS ${SRCS} TextureDecoder_x64.cpp)
+	set(SRCS ${SRCS} TextureDecoder_x64.cpp VertexLoaderX64.cpp)
 else()
 	set(SRCS ${SRCS} TextureDecoder_Generic.cpp)
 endif()
diff --git a/Source/Core/VideoCommon/CPMemory.h b/Source/Core/VideoCommon/CPMemory.h
index 124c30a04a..51e1ccc74f 100644
--- a/Source/Core/VideoCommon/CPMemory.h
+++ b/Source/Core/VideoCommon/CPMemory.h
@@ -20,10 +20,12 @@ enum
 
 // Vertex components
 enum
 {
-	NOT_PRESENT = 0,
-	DIRECT = 1,
-	INDEX8 = 2,
-	INDEX16 = 3,
+	NOT_PRESENT = 0,
+	DIRECT      = 1,
+	INDEX8      = 2,
+	INDEX16     = 3,
+
+	MASK_INDEXED = 2,
 };
 
 enum
diff --git a/Source/Core/VideoCommon/DataReader.h b/Source/Core/VideoCommon/DataReader.h
index f7e030739f..b27d513781 100644
--- a/Source/Core/VideoCommon/DataReader.h
+++ b/Source/Core/VideoCommon/DataReader.h
@@ -15,9 +15,9 @@ public:
 	__forceinline DataReader(u8* src, u8* _end) : buffer(src), end(_end) {}
 
-	__forceinline void WritePointer(u8** src)
+	__forceinline u8* GetPointer()
 	{
-		*src = buffer;
+		return buffer;
 	}
 
 	__forceinline u8* operator=(u8* src)
diff --git a/Source/Core/VideoCommon/OpcodeDecoding.cpp b/Source/Core/VideoCommon/OpcodeDecoding.cpp
index 34b821605f..0c1083b9c2 100644
--- a/Source/Core/VideoCommon/OpcodeDecoding.cpp
+++ b/Source/Core/VideoCommon/OpcodeDecoding.cpp
@@ -137,7 +137,7 @@ u8* OpcodeDecoder_Run(DataReader src, u32* cycles, bool in_display_list)
 	u8* opcodeStart;
 	while (true)
 	{
-		src.WritePointer(&opcodeStart);
+		opcodeStart = src.GetPointer();
 
 		if (!src.size())
 			goto end;
@@ -301,7 +301,7 @@ u8* OpcodeDecoder_Run(DataReader src, u32* cycles, bool in_display_list)
 		if (!is_preprocess && g_bRecordFifoData && cmd_byte != GX_CMD_CALL_DL)
 		{
 			u8* opcodeEnd;
-			src.WritePointer(&opcodeEnd);
+			opcodeEnd = src.GetPointer();
 			FifoRecorder::GetInstance().WriteGPCommand(opcodeStart, u32(opcodeEnd - opcodeStart));
 		}
 	}
diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp
index e2a2073ccf..734ea9a4a6 100644
--- a/Source/Core/VideoCommon/VertexLoader.cpp
+++ b/Source/Core/VideoCommon/VertexLoader.cpp
@@ -4,8 +4,6 @@
 
 #include "Common/CommonTypes.h"
 #include "Common/MemoryUtil.h"
-#include "Common/x64ABI.h"
-#include "Common/x64Emitter.h"
 
 #include "Core/Host.h"
 
@@ -21,9 +19,6 @@
 #include "VideoCommon/VideoCommon.h"
 #include "VideoCommon/VideoConfig.h"
 
-
-#define COMPILED_CODE_SIZE 4096
-
 #ifndef _WIN32
 #undef inline
 #define inline
@@ -33,9 +28,6 @@
 u8* g_video_buffer_read_ptr;
 u8* g_vertex_manager_write_ptr;
 
-using namespace Gen;
-
-
 void* VertexLoader::operator new (size_t size)
 {
 	return AllocateAlignedMemory(size, 16);
@@ -48,14 +40,9 @@ void VertexLoader::operator delete (void *p)
 
 static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader)
 {
-	BoundingBox::posMtxIdx = loader->m_curposmtx = DataReadU8() & 0x3f;
-	PRIM_LOG("posmtx: %d, ", loader->m_curposmtx);
-}
-
-static void LOADERDECL PosMtx_Write(VertexLoader* loader)
-{
-	// u8, 0, 0, 0
-	DataWrite(loader->m_curposmtx);
+	u8 posmtx = BoundingBox::posMtxIdx = DataReadU8() & 0x3f;
+	DataWrite(posmtx);
+	PRIM_LOG("posmtx: %d, ", posmtx);
 }
 
 static void LOADERDECL TexMtx_ReadDirect_UByte(VertexLoader* loader)
@@ -77,18 +64,16 @@ static void LOADERDECL TexMtx_Write_Float2(VertexLoader* loader)
 	DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++]));
 }
 
-static void LOADERDECL TexMtx_Write_Float4(VertexLoader* loader)
+static void LOADERDECL TexMtx_Write_Float3(VertexLoader* loader)
 {
 #if _M_SSE >= 0x200
 	__m128 output = _mm_cvtsi32_ss(_mm_castsi128_ps(_mm_setzero_si128()), loader->m_curtexmtx[loader->m_texmtxwrite++]);
 	_mm_storeu_ps((float*)g_vertex_manager_write_ptr, _mm_shuffle_ps(output, output, 0x45 /* 1, 1, 0, 1 */));
-	g_vertex_manager_write_ptr += sizeof(float) * 4;
+	g_vertex_manager_write_ptr += sizeof(float) * 3;
 #else
 	DataWrite(0.f);
 	DataWrite(0.f);
 	DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++]));
-	// Just to fill out with 0.
-	DataWrite(0.f);
 #endif
 }
 
@@ -106,19 +91,11 @@ static void LOADERDECL SkipVertex(VertexLoader* loader)
 VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
 : VertexLoaderBase(vtx_desc, vtx_attr)
 {
-	m_compiledCode = nullptr;
 	VertexLoader_Normal::Init();
 	VertexLoader_Position::Init();
 	VertexLoader_TextCoord::Init();
 
-	#ifdef USE_VERTEX_LOADER_JIT
-	AllocCodeSpace(COMPILED_CODE_SIZE);
 	CompileVertexTranslator();
-	WriteProtect();
-	#else
-	m_numPipelineStages = 0;
-	CompileVertexTranslator();
-	#endif
 
 	// generate frac factors
 	m_posScale[0] = m_posScale[1] = m_posScale[2] = m_posScale[3] = 1.0f / (1U << m_VtxAttr.PosFrac);
@@ -129,55 +106,13 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
 		m_colElements[i] = m_VtxAttr.color[i].Elements;
 }
 
-VertexLoader::~VertexLoader()
-{
-	#ifdef USE_VERTEX_LOADER_JIT
-	FreeCodeSpace();
-	#endif
-}
-
 void VertexLoader::CompileVertexTranslator()
 {
 	m_VertexSize = 0;
 	const TVtxAttr &vtx_attr = m_VtxAttr;
 
-#ifdef USE_VERTEX_LOADER_JIT
-	if (m_compiledCode)
-		PanicAlert("Trying to recompile a vertex translator");
-
-	m_compiledCode = GetCodePtr();
-	// We only use RAX (caller saved) and RBX (callee saved).
-	ABI_PushRegistersAndAdjustStack({RBX, RBP}, 8);
-
-	// save count
-	MOV(64, R(RBX), R(ABI_PARAM1));
-
-	// save loader
-	MOV(64, R(RBP), R(ABI_PARAM2));
-
-	// Start loop here
-	const u8 *loop_start = GetCodePtr();
-
-	// Reset component counters if present in vertex format only.
-	if (m_VtxDesc.Tex0Coord || m_VtxDesc.Tex1Coord || m_VtxDesc.Tex2Coord || m_VtxDesc.Tex3Coord ||
-		m_VtxDesc.Tex4Coord || m_VtxDesc.Tex5Coord || m_VtxDesc.Tex6Coord || m_VtxDesc.Tex7Coord)
-	{
-		WriteSetVariable(32, &m_tcIndex, Imm32(0));
-	}
-	if (m_VtxDesc.Color0 || m_VtxDesc.Color1)
-	{
-		WriteSetVariable(32, &m_colIndex, Imm32(0));
-	}
-	if (m_VtxDesc.Tex0MatIdx || m_VtxDesc.Tex1MatIdx || m_VtxDesc.Tex2MatIdx || m_VtxDesc.Tex3MatIdx ||
-		m_VtxDesc.Tex4MatIdx || m_VtxDesc.Tex5MatIdx || m_VtxDesc.Tex6MatIdx || m_VtxDesc.Tex7MatIdx)
-	{
-		WriteSetVariable(32, &m_texmtxwrite, Imm32(0));
-		WriteSetVariable(32, &m_texmtxread, Imm32(0));
-	}
-#else
 	// Reset pipeline
 	m_numPipelineStages = 0;
-#endif
 
 	// Get the pointer to this vertex's buffer data for the bounding box
 	if (!g_ActiveConfig.backend_info.bSupportsBBox)
@@ -195,16 +130,22 @@ void VertexLoader::CompileVertexTranslator()
 	// Position in pc vertex format.
 	int nat_offset = 0;
-	memset(&m_native_vtx_decl, 0, sizeof(m_native_vtx_decl));
 
 	// Position Matrix Index
 	if (m_VtxDesc.PosMatIdx)
 	{
 		WriteCall(PosMtx_ReadDirect_UByte);
 		components |= VB_HAS_POSMTXIDX;
+		m_native_vtx_decl.posmtx.components = 4;
+		m_native_vtx_decl.posmtx.enable = true;
+		m_native_vtx_decl.posmtx.offset = nat_offset;
+		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
+		m_native_vtx_decl.posmtx.integer = true;
+		nat_offset += 4;
 		m_VertexSize += 1;
 	}
+
 	if (m_VtxDesc.Tex0MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX0; WriteCall(TexMtx_ReadDirect_UByte); }
 	if (m_VtxDesc.Tex1MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX1; WriteCall(TexMtx_ReadDirect_UByte); }
 	if (m_VtxDesc.Tex2MatIdx) {m_VertexSize += 1; components |= VB_HAS_TEXMTXIDX2; WriteCall(TexMtx_ReadDirect_UByte); }
@@ -325,11 +266,7 @@ void VertexLoader::CompileVertexTranslator()
 		const int format = m_VtxAttr.texCoord[i].Format;
 		const int elements = m_VtxAttr.texCoord[i].Elements;
 
-		if (tc[i] == NOT_PRESENT)
-		{
-			components &= ~(VB_HAS_UV0 << i);
-		}
-		else
+		if (tc[i] != NOT_PRESENT)
 		{
 			_assert_msg_(VIDEO, DIRECT <= tc[i] && tc[i] <= INDEX16, "Invalid texture coordinates!\n(tc[i] = %d)", (u32)tc[i]);
 			_assert_msg_(VIDEO, FORMAT_UBYTE <= format && format <= FORMAT_FLOAT, "Invalid texture coordinates format!\n(format = %d)", format);
@@ -353,9 +290,9 @@ void VertexLoader::CompileVertexTranslator()
 				else
 				{
 					components |= VB_HAS_UV0 << i; // have to include since using now
-					m_native_vtx_decl.texcoords[i].components = 4;
-					nat_offset += 16; // still include the texture coordinate, but this time as 6 + 2 bytes
-					WriteCall(TexMtx_Write_Float4);
+					m_native_vtx_decl.texcoords[i].components = 3;
+					nat_offset += 12;
+					WriteCall(TexMtx_Write_Float3);
 				}
 			}
 			else
@@ -393,17 +330,6 @@ void VertexLoader::CompileVertexTranslator()
 	if (!g_ActiveConfig.backend_info.bSupportsBBox)
 		WriteCall(BoundingBox::Update);
 
-	if (m_VtxDesc.PosMatIdx)
-	{
-		WriteCall(PosMtx_Write);
-		m_native_vtx_decl.posmtx.components = 4;
-		m_native_vtx_decl.posmtx.enable = true;
-		m_native_vtx_decl.posmtx.offset = nat_offset;
-		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
-		m_native_vtx_decl.posmtx.integer = true;
-		nat_offset += 4;
-	}
-
 	// indexed position formats may skip a the vertex
 	if (m_VtxDesc.Position & 2)
 	{
@@ -412,49 +338,17 @@ void VertexLoader::CompileVertexTranslator()
 
 	m_native_components = components;
 	m_native_vtx_decl.stride = nat_offset;
-
-#ifdef USE_VERTEX_LOADER_JIT
-	// End loop here
-	SUB(64, R(RBX), Imm8(1));
-
-	J_CC(CC_NZ, loop_start);
-	ABI_PopRegistersAndAdjustStack({RBX, RBP}, 8);
-	RET();
-#endif
 }
 
 void VertexLoader::WriteCall(TPipelineFunction func)
 {
-#ifdef USE_VERTEX_LOADER_JIT
-	MOV(64, R(ABI_PARAM1), R(RBP));
-	ABI_CallFunction((const void*)func);
-#else
 	m_PipelineStages[m_numPipelineStages++] = func;
-#endif
 }
 
-// ARMTODO: This should be done in a better way
-#ifndef _M_GENERIC
-void VertexLoader::WriteGetVariable(int bits, OpArg dest, void *address)
-{
-#ifdef USE_VERTEX_LOADER_JIT
-	MOV(64, R(RAX), Imm64((u64)address));
-	MOV(bits, dest, MatR(RAX));
-#endif
-}
-
-void VertexLoader::WriteSetVariable(int bits, void *address, OpArg value)
-{
-#ifdef USE_VERTEX_LOADER_JIT
-	MOV(64, R(RAX), Imm64((u64)address));
-	MOV(bits, MatR(RAX), value);
-#endif
-}
-#endif
 
 int VertexLoader::RunVertices(int primitive, int count, DataReader src, DataReader dst)
 {
-	dst.WritePointer(&g_vertex_manager_write_ptr);
-	src.WritePointer(&g_video_buffer_read_ptr);
+	g_vertex_manager_write_ptr = dst.GetPointer();
+	g_video_buffer_read_ptr = src.GetPointer();
 
 	m_numLoadedVertices += count;
 	m_skippedVertices = 0;
 
@@ -463,12 +357,6 @@ int VertexLoader::RunVertices(int primitive, int count, DataReader src, DataRead
 	if (!g_ActiveConfig.backend_info.bSupportsBBox)
 		BoundingBox::Prepare(m_vat, primitive, m_VtxDesc, m_native_vtx_decl);
 
-#ifdef USE_VERTEX_LOADER_JIT
-	if (count > 0)
-	{
-		((void (*)(int, VertexLoader* loader))(void*)m_compiledCode)(count, this);
-	}
-#else
 	for (int s = 0; s < count; s++)
 	{
 		m_tcIndex = 0;
@@ -478,7 +366,6 @@ int VertexLoader::RunVertices(int primitive, int count, DataReader src, DataRead
 			m_PipelineStages[i](this);
 		PRIM_LOG("\n");
 	}
-#endif
 
 	return count - m_skippedVertices;
 }
diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h
index 4c883a3242..a7acd5dba1 100644
--- a/Source/Core/VideoCommon/VertexLoader.h
+++ b/Source/Core/VideoCommon/VertexLoader.h
@@ -11,7 +11,6 @@
 #include 
 
 #include "Common/CommonTypes.h"
-#include "Common/x64Emitter.h"
 
 #include "VideoCommon/CPMemory.h"
 #include "VideoCommon/DataReader.h"
@@ -26,10 +25,6 @@
 #include 
 #endif
 
-#ifdef _M_X86
-#define USE_VERTEX_LOADER_JIT
-#endif
-
 #ifdef WIN32
 #define LOADERDECL __cdecl
 #else
@@ -39,12 +34,7 @@
 class VertexLoader;
 typedef void (LOADERDECL *TPipelineFunction)(VertexLoader* loader);
 
-// ARMTODO: This should be done in a better way
-#ifndef _M_GENERIC
-class VertexLoader : public Gen::X64CodeBlock, public VertexLoaderBase
-#else
 class VertexLoader : public VertexLoaderBase
-#endif
 {
 public:
 	// This class need a 16 byte alignment. As this is broken on
@@ -53,7 +43,6 @@ public:
 	void operator delete (void *p);
 
 	VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
-	~VertexLoader();
 
 	int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;
 	std::string GetName() const override { return "OldLoader"; }
@@ -69,7 +58,6 @@ public:
 
 	// Matrix components are first in GC format but later in PC format - we need to store it temporarily
 	// when decoding each vertex.
-	u8 m_curposmtx;
 	u8 m_curtexmtx[8];
 	int m_texmtxwrite;
 	int m_texmtxread;
@@ -77,22 +65,13 @@ public:
 	int m_skippedVertices;
 
 private:
-#ifndef USE_VERTEX_LOADER_JIT
 	// Pipeline.
 	TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.
 	int m_numPipelineStages;
-#endif
 
 	void CompileVertexTranslator();
 	void WriteCall(TPipelineFunction);
-
-#ifndef _M_GENERIC
-	void WriteGetVariable(int bits, Gen::OpArg dest, void *address);
-	void WriteSetVariable(int bits, void *address, Gen::OpArg dest);
-#endif
-
-	const u8 *m_compiledCode;
 };
 
 #if _M_SSE >= 0x301
diff --git a/Source/Core/VideoCommon/VertexLoaderBase.cpp b/Source/Core/VideoCommon/VertexLoaderBase.cpp
index ed56db0655..815e162689 100644
--- a/Source/Core/VideoCommon/VertexLoaderBase.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderBase.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2
 // Refer to the license.txt file included.
 
+#include 
 
 #include "Common/StringUtil.h"
 
@@ -9,11 +10,17 @@
 #include "VideoCommon/VertexLoader.h"
 #include "VideoCommon/VertexLoaderBase.h"
 
+#ifdef _M_X86_64
+#include "VideoCommon/VertexLoaderX64.h"
+#endif
+
 VertexLoaderBase::VertexLoaderBase(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
 {
 	m_numLoadedVertices = 0;
 	m_VertexSize = 0;
 	m_native_vertex_format = nullptr;
+	m_native_components = 0;
+	memset(&m_native_vtx_decl, 0, sizeof(m_native_vtx_decl));
 
 	SetVAT(vtx_attr);
 	m_VtxDesc = vtx_desc;
@@ -131,15 +138,34 @@ class VertexLoaderTester : public VertexLoaderBase
 {
 public:
 	VertexLoaderTester(VertexLoaderBase* _a, VertexLoaderBase* _b, const TVtxDesc& vtx_desc, const VAT& vtx_attr)
-	: VertexLoaderBase(vtx_desc, vtx_attr)
+	: VertexLoaderBase(vtx_desc, vtx_attr), a(_a), b(_b)
 	{
-		a = _a;
-		b = _b;
 		m_initialized = a && b && a->IsInitialized() && b->IsInitialized();
-		m_initialized = m_initialized && (a->m_VertexSize == b->m_VertexSize);
-		m_initialized = m_initialized && (a->m_native_vtx_decl.stride == b->m_native_vtx_decl.stride);
+		bool can_test = a->m_VertexSize == b->m_VertexSize &&
+		                a->m_native_components == b->m_native_components &&
+		                a->m_native_vtx_decl.stride == b->m_native_vtx_decl.stride;
+
+		if (m_initialized)
+		{
+			if (can_test)
+			{
+				m_VertexSize = a->m_VertexSize;
+				m_native_components = a->m_native_components;
+				memcpy(&m_native_vtx_decl, &a->m_native_vtx_decl, sizeof(PortableVertexDeclaration));
+			}
+			else
+			{
+				ERROR_LOG(VIDEO, "Can't compare vertex loaders that expect different vertex formats!");
+				ERROR_LOG(VIDEO, "a: m_VertexSize %d, m_native_components 0x%08x, stride %d\n",
+				          a->m_VertexSize, a->m_native_components, a->m_native_vtx_decl.stride);
+				ERROR_LOG(VIDEO, "b: m_VertexSize %d, m_native_components 0x%08x, stride %d\n",
+				          b->m_VertexSize, b->m_native_components, b->m_native_vtx_decl.stride);
+			}
+		}
+
+		m_initialized &= can_test;
 	}
-	~VertexLoaderTester()
+	~VertexLoaderTester() override
 	{
 		delete a;
 		delete b;
@@ -147,21 +173,22 @@ public:
 	int RunVertices(int primitive, int count, DataReader src, DataReader dst) override
 	{
-		buffer_a.resize(count * a->m_native_vtx_decl.stride);
-		buffer_b.resize(count * b->m_native_vtx_decl.stride);
+		buffer_a.resize(count * a->m_native_vtx_decl.stride + 4);
+		buffer_b.resize(count * b->m_native_vtx_decl.stride + 4);
 
 		int count_a = a->RunVertices(primitive, count, src, DataReader(buffer_a.data(), buffer_a.data()+buffer_a.size()));
 		int count_b = b->RunVertices(primitive, count, src, DataReader(buffer_b.data(), buffer_b.data()+buffer_b.size()));
 
 		if (count_a != count_b)
-			ERROR_LOG(VIDEO, "Both vertexloaders have loaded a different amount of vertices.");
+			ERROR_LOG(VIDEO, "The two vertex loaders have loaded a different amount of vertices (a: %d, b: %d).", count_a, count_b);
 
-		if (memcmp(buffer_a.data(), buffer_b.data(), std::min(count_a, count_b)))
-			ERROR_LOG(VIDEO, "Both vertexloaders have loaded different data.");
+		if (memcmp(buffer_a.data(), buffer_b.data(), std::min(count_a, count_b) * m_native_vtx_decl.stride))
+			ERROR_LOG(VIDEO, "The two vertex loaders have loaded different data "
+			                 "(guru meditation 0x%016" PRIx64 ", 0x%08x, 0x%08x, 0x%08x).",
+			          m_VtxDesc.Hex, m_vat.g0.Hex, m_vat.g1.Hex, m_vat.g2.Hex);
 
-		u8* dstptr;
-		dst.WritePointer(&dstptr);
-		memcpy(dstptr, buffer_a.data(), count_a);
+		memcpy(dst.GetPointer(), buffer_a.data(), count_a * m_native_vtx_decl.stride);
+		m_numLoadedVertices += count;
 		return count_a;
 	}
 	std::string GetName() const override { return "CompareLoader"; }
@@ -177,15 +204,22 @@ VertexLoaderBase* VertexLoaderBase::CreateVertexLoader(const TVtxDesc& vtx_desc,
 {
 	VertexLoaderBase* loader;
 
-#if 0
+//#define COMPARE_VERTEXLOADERS
+
+#if defined(COMPARE_VERTEXLOADERS) && defined(_M_X86_64)
 	// first try: Any new VertexLoader vs the old one
 	loader = new VertexLoaderTester(
 			new VertexLoader(vtx_desc, vtx_attr),    // the software one
-			new VertexLoader(vtx_desc, vtx_attr),    // the new one to compare
+			new VertexLoaderX64(vtx_desc, vtx_attr), // the new one to compare
 			vtx_desc, vtx_attr);
 	if (loader->IsInitialized())
 		return loader;
 	delete loader;
+#elif defined(_M_X86_64)
+	loader = new VertexLoaderX64(vtx_desc, vtx_attr);
+	if (loader->IsInitialized())
+		return loader;
+	delete loader;
 #endif
 
 	// last try: The old VertexLoader
diff --git a/Source/Core/VideoCommon/VertexLoaderBase.h b/Source/Core/VideoCommon/VertexLoaderBase.h
index 7da2d866f9..c5a11d2e63 100644
--- a/Source/Core/VideoCommon/VertexLoaderBase.h
+++ b/Source/Core/VideoCommon/VertexLoaderBase.h
@@ -72,7 +72,7 @@ class VertexLoaderBase
 {
 public:
 	static VertexLoaderBase* CreateVertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
-	virtual ~VertexLoaderBase() {};
+	virtual ~VertexLoaderBase() {}
 
 	virtual int RunVertices(int primitive, int count, DataReader src, DataReader dst) = 0;
diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp
index b95e286c1e..786cb45a96 100644
--- a/Source/Core/VideoCommon/VertexLoaderManager.cpp
+++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp
@@ -121,11 +121,10 @@ static VertexLoaderBase* RefreshLoader(int vtx_attr_group, bool preprocess = fal
 	{
 		// search for a cached native vertex format
 		const PortableVertexDeclaration& format = loader->m_native_vtx_decl;
-		auto& native = s_native_vertex_map[format];
+		std::unique_ptr<NativeVertexFormat>& native = s_native_vertex_map[format];
 		if (!native)
 		{
-			auto raw_pointer = g_vertex_manager->CreateNativeVertexFormat();
-			native = std::unique_ptr<NativeVertexFormat>(raw_pointer);
+			native.reset(g_vertex_manager->CreateNativeVertexFormat());
 			native->Initialize(format);
 			native->m_components = loader->m_native_components;
 		}
diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp
new file mode 100644
index 0000000000..c9b435bae3
--- /dev/null
+++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp
@@ -0,0 +1,456 @@
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+#include "Common/CPUDetect.h"
+#include "Common/JitRegister.h"
+#include "Common/x64ABI.h"
+#include "VideoCommon/VertexLoaderX64.h"
+
+using namespace Gen;
+
+static const X64Reg src_reg = ABI_PARAM1;
+static const X64Reg dst_reg = ABI_PARAM2;
+static const X64Reg scratch1 = RAX;
+static const X64Reg scratch2 = ABI_PARAM3;
+static const X64Reg scratch3 = ABI_PARAM4;
+static const X64Reg count_reg = R10;
+static const X64Reg skipped_reg = R11;
+
+VertexLoaderX64::VertexLoaderX64(const TVtxDesc& vtx_desc, const VAT& vtx_att): VertexLoaderBase(vtx_desc, vtx_att)
+{
+	if (!IsInitialized())
+		return;
+
+	AllocCodeSpace(4096);
+	ClearCodeSpace();
+	GenerateVertexLoader();
+	WriteProtect();
+
+	std::string name;
+	AppendToString(&name);
+	JitRegister::Register(region, (u32)(GetCodePtr() - region), name.c_str());
+}
+
+OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute)
+{
+	OpArg data = MDisp(src_reg, m_src_ofs);
+	if (attribute & MASK_INDEXED)
+	{
+		if (attribute == INDEX8)
+		{
+			MOVZX(64, 8, scratch1, data);
+			m_src_ofs += 1;
+		}
+		else
+		{
+			MOV(16, R(scratch1), data);
+			m_src_ofs += 2;
+			BSWAP(16, scratch1);
+			MOVZX(64, 16, scratch1, R(scratch1));
+		}
+		if (array == ARRAY_POSITION)
+		{
+			CMP(attribute == INDEX8 ? 8 : 16, R(scratch1), Imm8(-1));
+			m_skip_vertex = J_CC(CC_E, true);
+		}
+		// TODO: Move cached_arraybases into CPState and use MDisp() relative to a constant register loaded with &g_main_cp_state.
+		IMUL(32, scratch1, M(&g_main_cp_state.array_strides[array]));
+		MOV(64, R(scratch2), M(&cached_arraybases[array]));
+		return MRegSum(scratch1, scratch2);
+	}
+	else
+	{
+		return data;
+	}
+}
+
+int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format)
+{
+	static const __m128i shuffle_lut[5][3] = {
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L),  // 1x u8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L),  // 2x u8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)}, // 3x u8
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00FFFFFFL),  // 1x s8
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL),  // 2x s8
+		 _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL)}, // 3x s8
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0001L),  // 1x u16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L),  // 2x u16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L)}, // 3x u16
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL),  // 1x s16
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL),  // 2x s16
+		 _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)}, // 3x s16
+		{_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L),  // 1x float
+		 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L),  // 2x float
+		 _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)}, // 3x float
+	};
+	static const __m128 scale_factors[32] = {
+		_mm_set_ps1(1./(1u<< 0)), _mm_set_ps1(1./(1u<< 1)), _mm_set_ps1(1./(1u<< 2)), _mm_set_ps1(1./(1u<< 3)),
+		_mm_set_ps1(1./(1u<< 4)), _mm_set_ps1(1./(1u<< 5)), _mm_set_ps1(1./(1u<< 6)), _mm_set_ps1(1./(1u<< 7)),
+		_mm_set_ps1(1./(1u<< 8)), _mm_set_ps1(1./(1u<< 9)), _mm_set_ps1(1./(1u<<10)), _mm_set_ps1(1./(1u<<11)),
+		_mm_set_ps1(1./(1u<<12)), _mm_set_ps1(1./(1u<<13)), _mm_set_ps1(1./(1u<<14)), _mm_set_ps1(1./(1u<<15)),
+		_mm_set_ps1(1./(1u<<16)), _mm_set_ps1(1./(1u<<17)), _mm_set_ps1(1./(1u<<18)), _mm_set_ps1(1./(1u<<19)),
+		_mm_set_ps1(1./(1u<<20)), _mm_set_ps1(1./(1u<<21)), _mm_set_ps1(1./(1u<<22)), _mm_set_ps1(1./(1u<<23)),
+		_mm_set_ps1(1./(1u<<24)), _mm_set_ps1(1./(1u<<25)), _mm_set_ps1(1./(1u<<26)), _mm_set_ps1(1./(1u<<27)),
+		_mm_set_ps1(1./(1u<<28)), _mm_set_ps1(1./(1u<<29)), _mm_set_ps1(1./(1u<<30)), _mm_set_ps1(1./(1u<<31)),
+	};
+
+	X64Reg coords = XMM0;
+
+	int elem_size = 1 << (format / 2);
+	int load_bytes = elem_size * count_in;
+	if (load_bytes >= 8)
+		MOVDQU(coords, data);
+	else if (load_bytes >= 4)
+		MOVQ_xmm(coords, data);
+	else
+		MOVD_xmm(coords, data);
+
+	PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
+
+	if (format != FORMAT_FLOAT)
+	{
+		// Sign extend
+		if (format == FORMAT_BYTE)
+			PSRAD(coords, 24);
+		if (format == FORMAT_SHORT)
+			PSRAD(coords, 16);
+
+		CVTDQ2PS(coords, R(coords));
+
+		if (scaling_exponent)
+			MULPS(coords, M(&scale_factors[scaling_exponent]));
+	}
+
+	OpArg dest = MDisp(dst_reg, m_dst_ofs);
+	switch (count_out)
+	{
+	case 1: MOVSS(dest, coords); break;
+	case 2: MOVLPS(dest, coords); break;
+	case 3: MOVUPS(dest, coords); break;
+	}
+
+	native_format->components = count_out;
+	native_format->enable = true;
+	native_format->offset = m_dst_ofs;
+	native_format->type = VAR_FLOAT;
+	native_format->integer = false;
+	m_dst_ofs += sizeof(float) * count_out;
+
+	if (attribute == DIRECT)
+		m_src_ofs += load_bytes;
+
+	return load_bytes;
+}
+
+void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format, int elements)
+{
+	int load_bytes = 0;
+	switch (format)
+	{
+	case FORMAT_24B_888:
+	case FORMAT_32B_888x:
+	case FORMAT_32B_8888:
+		MOV(32, R(scratch1), data);
+		// See VertexLoader_Color.cpp for a comment on this condition.
+		if (format != FORMAT_32B_8888 || !elements)
+			OR(32, R(scratch1), Imm32(0xFF000000));
+		MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
+		load_bytes = 3 + (format != FORMAT_24B_888);
+		break;
+
+	case FORMAT_16B_565:
+		// RRRRRGGG GGGBBBBB
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		LoadAndSwap(16, scratch1, data);
+		if (cpu_info.bBMI1 && cpu_info.bBMI2)
+		{
+			MOV(32, R(scratch2), Imm32(0x07C3F7C0));
+			PDEP(32, scratch3, scratch1, R(scratch2));
+
+			MOV(32, R(scratch2), Imm32(0xF8FCF800));
+			PDEP(32, scratch1, scratch1, R(scratch2));
+			ANDN(32, scratch2, scratch2, R(scratch3));
+
+			OR(32, R(scratch1), R(scratch2));
+		}
+		else
+		{
+			MOV(32, R(scratch3), R(scratch1));
+			SHL(32, R(scratch1), Imm8(16));
+			AND(32, R(scratch1), Imm32(0xF8000000));
+
+			MOV(32, R(scratch2), R(scratch3));
+			SHL(32, R(scratch2), Imm8(13));
+			AND(32, R(scratch2), Imm32(0x00FC0000));
+			OR(32, R(scratch1), R(scratch2));
+
+			SHL(32, R(scratch3), Imm8(11));
+			AND(32, R(scratch3), Imm32(0x0000F800));
+			OR(32, R(scratch1), R(scratch3));
+
+			MOV(32, R(scratch2), R(scratch1));
+			SHR(32, R(scratch1), Imm8(5));
+			AND(32, R(scratch1), Imm32(0x07000700));
+			OR(32, R(scratch1), R(scratch2));
+
+			SHR(32, R(scratch2), Imm8(6));
+			AND(32, R(scratch2), Imm32(0x00030000));
+			OR(32, R(scratch1), R(scratch2));
+		}
+
+		OR(32, R(scratch1), Imm32(0x000000FF));
+		SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 2;
+		break;
+
+	case FORMAT_16B_4444:
+		// RRRRGGGG BBBBAAAA
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		LoadAndSwap(16, scratch1, data);
+		if (cpu_info.bBMI2)
+		{
+			MOV(32, R(scratch3), Imm32(0x0F0F0F0F));
+			PDEP(32, scratch2, scratch1, R(scratch3));
+			MOV(32, R(scratch3), Imm32(0xF0F0F0F0));
+			PDEP(32, scratch1, scratch1, R(scratch3));
+		}
+		else
+		{
+			MOV(32, R(scratch3), R(scratch1));
+			SHL(32, R(scratch1), Imm8(12));
+			AND(32, R(scratch1), Imm32(0x0F000000));
+			MOV(32, R(scratch2), R(scratch1));
+
+			MOV(32, R(scratch1), R(scratch3));
+			SHL(32, R(scratch1), Imm8(8));
+			AND(32, R(scratch1), Imm32(0x000F0000));
+			OR(32, R(scratch2), R(scratch1));
+
+			MOV(32, R(scratch1), R(scratch3));
+			SHL(32, R(scratch1), Imm8(4));
+			AND(32, R(scratch1), Imm32(0x00000F00));
+			OR(32, R(scratch2), R(scratch1));
+
+			AND(32, R(scratch3), Imm8(0x0F));
+			OR(32, R(scratch2), R(scratch3));
+
+			MOV(32, R(scratch1), R(scratch2));
+			SHL(32, R(scratch1), Imm8(4));
+		}
+		OR(32, R(scratch1), R(scratch2));
+		SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 2;
+		break;
+
+	case FORMAT_24B_6666:
+		// RRRRRRGG GGGGBBBB BBAAAAAA
+		// AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR
+		data.offset -= 1;
+		LoadAndSwap(32, scratch1, data);
+		if (cpu_info.bBMI2)
+		{
+			MOV(32, R(scratch2), Imm32(0xFCFCFCFC));
+			PDEP(32, scratch1, scratch1, R(scratch2));
+			MOV(32, R(scratch2), R(scratch1));
+		}
+		else
+		{
+			MOV(32, R(scratch3), R(scratch1));
+			SHL(32, R(scratch1), Imm8(8));
+			AND(32, R(scratch1), Imm32(0xFC000000));
+			MOV(32, R(scratch2), R(scratch1));
+
+			MOV(32, R(scratch1), R(scratch3));
+			SHL(32, R(scratch1), Imm8(6));
+			AND(32, R(scratch1), Imm32(0x00FC0000));
+			OR(32, R(scratch2), R(scratch1));
+
+			MOV(32, R(scratch1), R(scratch3));
+			SHL(32, R(scratch1), Imm8(4));
+			AND(32, R(scratch1), Imm32(0x0000FC00));
+			OR(32, R(scratch2), R(scratch1));
+
+			SHL(32, R(scratch3), Imm8(2));
+			AND(32, R(scratch3), Imm32(0x000000FC));
+			OR(32, R(scratch2), R(scratch3));
+
+			MOV(32, R(scratch1), R(scratch2));
+		}
+
+		SHR(32, R(scratch1), Imm8(6));
+		AND(32, R(scratch1), Imm32(0x03030303));
+		OR(32, R(scratch1), R(scratch2));
+
+		SwapAndStore(32, MDisp(dst_reg, m_dst_ofs), scratch1);
+		load_bytes = 3;
+		break;
+	}
+	if (attribute == DIRECT)
+		m_src_ofs += load_bytes;
+}
+
+void VertexLoaderX64::GenerateVertexLoader()
+{
+	// Backup count since we're going to count it down.
+	PUSH(32, R(ABI_PARAM3));
+
+	// We use ABI_PARAM3 for scratch2.
+	MOV(32, R(count_reg), R(ABI_PARAM3));
+
+	if (m_VtxDesc.Position & MASK_INDEXED)
+		XOR(32, R(skipped_reg), R(skipped_reg));
+
+	// TODO: load constants into registers outside the main loop
+
+	const u8* loop_start = GetCodePtr();
+
+	if (m_VtxDesc.PosMatIdx)
+	{
+		MOVZX(32, 8, scratch1, MDisp(src_reg, m_src_ofs));
+		AND(32, R(scratch1), Imm8(0x3F));
+		MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1));
+		m_native_components |= VB_HAS_POSMTXIDX;
+		m_native_vtx_decl.posmtx.components = 4;
+		m_native_vtx_decl.posmtx.enable = true;
+		m_native_vtx_decl.posmtx.offset = m_dst_ofs;
+		m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE;
+		m_native_vtx_decl.posmtx.integer = true;
+		m_src_ofs += sizeof(u8);
+		m_dst_ofs += sizeof(u32);
+	}
+
+	u32 texmatidx_ofs[8];
+	const u64 tm[8] = {
+		m_VtxDesc.Tex0MatIdx, m_VtxDesc.Tex1MatIdx, m_VtxDesc.Tex2MatIdx, m_VtxDesc.Tex3MatIdx,
+		m_VtxDesc.Tex4MatIdx, m_VtxDesc.Tex5MatIdx, m_VtxDesc.Tex6MatIdx, m_VtxDesc.Tex7MatIdx,
+	};
+	for (int i = 0; i < 8; i++)
+	{
+		if (tm[i])
+			texmatidx_ofs[i] = m_src_ofs++;
+	}
+
+	OpArg data = GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position);
+	ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, m_VtxAttr.PosFrac, &m_native_vtx_decl.position);
+
+	if (m_VtxDesc.Normal)
+	{
+		static const u8 map[8] = {7, 6, 15, 14};
+		u8 scaling_exponent = map[m_VtxAttr.NormalFormat];
+
+		for (int i = 0; i < (m_VtxAttr.NormalElements ? 3 : 1); i++)
+		{
+			if (!i || m_VtxAttr.NormalIndex3)
+			{
+				data = GetVertexAddr(ARRAY_NORMAL, m_VtxDesc.Normal);
+				int elem_size = 1 << (m_VtxAttr.NormalFormat / 2);
+				data.offset += i * elem_size * 3;
+			}
+			data.offset += ReadVertex(data, m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, scaling_exponent, &m_native_vtx_decl.normals[i]);
+		}
+
+		m_native_components |= VB_HAS_NRM0;
+		if (m_VtxAttr.NormalElements)
+			m_native_components |= VB_HAS_NRM1 | VB_HAS_NRM2;
+	}
+
+	const u64 col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1};
+	for (int i = 0; i < 2; i++)
+	{
+		if (col[i])
+		{
+			data = GetVertexAddr(ARRAY_COLOR + i, col[i]);
+			ReadColor(data, col[i], m_VtxAttr.color[i].Comp, m_VtxAttr.color[i].Elements);
+			m_native_components |= VB_HAS_COL0 << i;
+			m_native_vtx_decl.colors[i].components = 4;
+			m_native_vtx_decl.colors[i].enable = true;
+			m_native_vtx_decl.colors[i].offset = m_dst_ofs;
+			m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE;
+			m_native_vtx_decl.colors[i].integer = false;
+			m_dst_ofs += 4;
+		}
+	}
+
+	const u64 tc[8] = {
+		m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord,
+		m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, m_VtxDesc.Tex7Coord,
+	};
+	for (int i = 0; i < 8; i++)
+	{
+		int elements = m_VtxAttr.texCoord[i].Elements + 1;
+		if (tc[i])
+		{
+			data = GetVertexAddr(ARRAY_TEXCOORD0 + i, tc[i]);
+			u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac;
+			ReadVertex(data, tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements, scaling_exponent, &m_native_vtx_decl.texcoords[i]);
+			m_native_components |= VB_HAS_UV0 << i;
+		}
+		if (tm[i])
+		{
+			m_native_components |= VB_HAS_TEXMTXIDX0 << i;
+			m_native_vtx_decl.texcoords[i].components = 3;
+			m_native_vtx_decl.texcoords[i].enable = true;
+			m_native_vtx_decl.texcoords[i].type = VAR_FLOAT;
+			m_native_vtx_decl.texcoords[i].integer = false;
+			MOVZX(64, 8, scratch1, MDisp(src_reg, texmatidx_ofs[i]));
+			if (tc[i])
+			{
+				CVTSI2SS(XMM0, R(scratch1));
+				MOVSS(MDisp(dst_reg, m_dst_ofs), XMM0);
+				m_dst_ofs += sizeof(float);
+			}
+			else
+			{
+				m_native_vtx_decl.texcoords[i].offset = m_dst_ofs;
+				PXOR(XMM0, R(XMM0));
+				CVTSI2SS(XMM0, R(scratch1));
+				SHUFPS(XMM0, R(XMM0), 0x45);
+				MOVUPS(MDisp(dst_reg, m_dst_ofs), XMM0);
+				m_dst_ofs += sizeof(float) * 3;
+			}
+		}
+	}
+
+	// Prepare for the next vertex.
+	ADD(64, R(dst_reg), Imm32(m_dst_ofs));
+	const u8* cont = GetCodePtr();
+	ADD(64, R(src_reg), Imm32(m_src_ofs));
+
+	SUB(32, R(count_reg), Imm8(1));
+	J_CC(CC_NZ, loop_start);
+
+	// Get the original count.
+	POP(32, R(ABI_RETURN));
+
+	if (m_VtxDesc.Position & MASK_INDEXED)
+	{
+		SUB(32, R(ABI_RETURN), R(skipped_reg));
+		RET();
+
+		SetJumpTarget(m_skip_vertex);
+		ADD(32, R(skipped_reg), Imm8(1));
+		JMP(cont);
+	}
+	else
+	{
+		RET();
+	}
+
+	m_VertexSize = m_src_ofs;
+	m_native_vtx_decl.stride = m_dst_ofs;
+}
+
+bool VertexLoaderX64::IsInitialized()
+{
+	// Uses PSHUFB.
+	return cpu_info.bSSSE3;
+}
+
+int VertexLoaderX64::RunVertices(int primitive, int count, DataReader src, DataReader dst)
+{
+	m_numLoadedVertices += count;
+	return ((int (*)(u8* src, u8* dst, int count))region)(src.GetPointer(), dst.GetPointer(), count);
+}
diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h
new file mode 100644
index 0000000000..3cf9d7c8ba
--- /dev/null
+++ b/Source/Core/VideoCommon/VertexLoaderX64.h
@@ -0,0 +1,22 @@
+#include "Common/x64Emitter.h"
+#include "VideoCommon/VertexLoaderBase.h"
+
+class VertexLoaderX64 : public VertexLoaderBase, public Gen::X64CodeBlock
+{
+public:
+	VertexLoaderX64(const TVtxDesc& vtx_desc, const VAT& vtx_att);
+
+protected:
+	std::string GetName() const override { return "VertexLoaderX64"; }
+	bool IsInitialized() override;
+	int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;
+
+private:
+	u32 m_src_ofs = 0;
+	u32 m_dst_ofs = 0;
+	Gen::FixupBranch m_skip_vertex;
+	Gen::OpArg GetVertexAddr(int array, u64 attribute);
+	int ReadVertex(Gen::OpArg data, u64 attribute, int format, int count_in, int count_out, u8 scaling_exponent, AttributeFormat* native_format);
+	void ReadColor(Gen::OpArg data, u64 attribute, int format, int elements);
+	void GenerateVertexLoader();
+};
diff --git a/Source/Core/VideoCommon/VertexLoader_Color.cpp b/Source/Core/VideoCommon/VertexLoader_Color.cpp
index 58b1b96f31..5f2a8106c7 100644
--- a/Source/Core/VideoCommon/VertexLoader_Color.cpp
+++ b/Source/Core/VideoCommon/VertexLoader_Color.cpp
@@ -10,10 +10,6 @@
 #include "VideoCommon/VertexManagerBase.h"
 #include "VideoCommon/VideoCommon.h"
 
-#define RSHIFT 0
-#define GSHIFT 8
-#define BSHIFT 16
-#define ASHIFT 24
 #define AMASK 0xFF000000
 
 __forceinline void _SetCol(VertexLoader* loader, u32 val)
@@ -24,13 +20,14 @@ __forceinline void _SetCol(VertexLoader* loader, u32 val)
 
 //color comes in format BARG in 16 bits
 //BARG -> AABBGGRR
-__forceinline void _SetCol4444(VertexLoader* loader, u16 val)
+__forceinline void _SetCol4444(VertexLoader* loader, u16 val_)
 {
-	u32 col = (val & 0xF0);              // col = 000000R0;
-	col |=    (val & 0xF ) << 12;        // col |= 0000G000;
-	col |= (((u32)val) & 0xF000) << 8;   // col |= 00B00000;
-	col |= (((u32)val) & 0x0F00) << 20;  // col |= A0000000;
-	col |= col >> 4;                     // col = A0B0G0R0 | 0A0B0G0R;
+	u32 col, val = val_;
+	col  =  val & 0x00F0;          // col = 000000R0;
+	col |= (val & 0x000F) << 12;   // col |= 0000G000;
+	col |= (val & 0xF000) << 8;    // col |= 00B00000;
+	col |= (val & 0x0F00) << 20;   // col |= A0000000;
+	col |= col >> 4;               // col = A0B0G0R0 | 0A0B0G0R;
 	_SetCol(loader, col);
 }
 
@@ -38,9 +35,9 @@
 //RRRRRRGG GGGGBBBB BBAAAAAA
 __forceinline void _SetCol6666(VertexLoader* loader, u32 val)
 {
-	u32 col = (val >> 16) & 0xFC;
-	col |=    (val >>  2) & 0xFC00;
-	col |=    (val << 12) & 0xFC0000;
+	u32 col = (val >> 16) & 0x000000FC;
+	col |=    (val >>  2) & 0x0000FC00;
+	col |=    (val << 12) & 0x00FC0000;
 	col |=    (val << 26) & 0xFC000000;
 	col |=    (col >>  6) & 0x03030303;
 	_SetCol(loader, col);
@@ -48,13 +45,14 @@
 //color comes in RGB
 //RRRRRGGG GGGBBBBB
-__forceinline void _SetCol565(VertexLoader* loader, u16 val)
+__forceinline void _SetCol565(VertexLoader* loader, u16 val_)
 {
-	u32 col = (val >> 8) & 0xF8;
-	col |= (val << 5) & 0xFC00;
-	col |=(((u32)val) << 19) & 0xF80000;
-	col |= (col >> 5) & 0x070007;
-	col |= (col >> 6) & 0x000300;
+	u32 col, val = val_;
+	col  = (val >>  8) & 0x0000F8;
+	col |= (val <<  5) & 0x00FC00;
+	col |= (val << 19) & 0xF80000;
+	col |= (col >>  5) & 0x070007;
+	col |= (col >>  6) & 0x000300;
 	_SetCol(loader, col | AMASK);
 }
 
@@ -96,11 +94,6 @@ void LOADERDECL Color_ReadDirect_24b_6666(VertexLoader* loader)
 }
 // F|RES: i am not 100 percent sure, but the colElements seems to be important for rendering only
 // at least it fixes mario party 4
-//
-//	if (colElements[colIndex])
-//	else
-//	col |= 0xFF<m_colElements[loader->m_colIndex])
-		col |= 0xFF << ASHIFT;
+		col |= AMASK;
 	_SetCol(loader, col);
 }
diff --git a/Source/Core/VideoCommon/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/VertexLoader_Normal.cpp
index 70081ead11..0d794d9944 100644
--- a/Source/Core/VideoCommon/VertexLoader_Normal.cpp
+++ b/Source/Core/VideoCommon/VertexLoader_Normal.cpp
@@ -50,7 +50,7 @@ __forceinline void ReadIndirect(const T* data)
 		dst.Write(FracAdjust(Common::FromBigEndian(data[i])));
 	}
 
-	dst.WritePointer(&g_vertex_manager_write_ptr);
+	g_vertex_manager_write_ptr = dst.GetPointer();
 	LOG_NORM();
 }
@@ -110,7 +110,7 @@ struct Normal_Direct_SSSE3
 	{
 		const T* pData = reinterpret_cast<const T*>(DataGetPosition());
 		const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
-		const __m128 scale = _mm_set_ps(frac, frac, frac, frac);
+		const __m128 scale = _mm_set_ps1(frac);
 		for (int i = 0; i < N; i++, pData += 3)
 			Vertex_Read_SSSE3(pData, scale);
 		DataSkip();
@@ -128,7 +128,7 @@ __forceinline void Normal_Index_Offset_SSSE3()
 	const T* pData = (const T*)(cached_arraybases[ARRAY_NORMAL] + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset);
 	const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
-	const __m128 scale = _mm_set_ps(frac, frac, frac, frac);
+	const __m128 scale = _mm_set_ps1(frac);
 	for (int i = 0; i < N; i++, pData += 3)
 		Vertex_Read_SSSE3(pData, scale);
 }
diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp
index 419c041b5b..3313c12b5a 100644
--- a/Source/Core/VideoCommon/VertexLoader_Position.cpp
+++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp
@@ -81,8 +81,8 @@ void LOADERDECL Pos_ReadDirect(VertexLoader* loader)
 	for (int i = 0; i < 3; ++i)
 		dst.Write(i(), scale) : 0.f);
 
-	dst.WritePointer(&g_vertex_manager_write_ptr);
-	src.WritePointer(&g_video_buffer_read_ptr);
+	g_vertex_manager_write_ptr = dst.GetPointer();
+	g_video_buffer_read_ptr = src.GetPointer();
 	LOG_VTX();
 }
@@ -101,7 +101,7 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
 	for (int i = 0; i < 3; ++i)
 		dst.Write(i(), scale));
 
-	dst.WritePointer(&g_vertex_manager_write_ptr);
-	src.WritePointer(&g_video_buffer_read_ptr);
+	g_vertex_manager_write_ptr = dst.GetPointer();
+	g_video_buffer_read_ptr = src.GetPointer();
 	LOG_TEX();
 	++loader->m_tcIndex;
@@ -77,7 +77,7 @@ void LOADERDECL TexCoord_ReadIndex(VertexLoader* loader)
 	for (int i = 0; i != N; ++i)
 		dst.Write(TCScale(Common::FromBigEndian(data[i]), scale));
 
-	dst.WritePointer(&g_vertex_manager_write_ptr);
+	g_vertex_manager_write_ptr = dst.GetPointer();
 	LOG_TEX();
 	++loader->m_tcIndex;
 }
@@ -166,14 +166,14 @@ void VertexLoader_TextCoord::Init()
 	tableReadTexCoord[1][3][1] = TexCoord_ReadDirect2_SSSE3;
 	tableReadTexCoord[1][4][1] = TexCoord_ReadDirect2_SSSE3;
 	tableReadTexCoord[2][0][1] = TexCoord_ReadIndex2_SSSE3;
-	tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3;
 	tableReadTexCoord[2][1][1] = TexCoord_ReadIndex2_SSSE3;
-	tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3;
 	tableReadTexCoord[2][2][1] = TexCoord_ReadIndex2_SSSE3;
-	tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3;
 	tableReadTexCoord[2][3][1] = TexCoord_ReadIndex2_SSSE3;
-	tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3;
 	tableReadTexCoord[2][4][1] = TexCoord_ReadIndex2_SSSE3;
+	tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3;
+	tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3;
+	tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3;
+	tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3;
 	tableReadTexCoord[3][4][1] = TexCoord_ReadIndex2_SSSE3;
 }
 #endif
diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj b/Source/Core/VideoCommon/VideoCommon.vcxproj
index b4db374e28..f76c59e8ba 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcxproj
+++ b/Source/Core/VideoCommon/VideoCommon.vcxproj
@@ -66,6 +66,7 @@
+    <ClCompile Include="VertexLoaderX64.cpp" />
diff --git a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
index d00ee5518c..eca9287439 100644
--- a/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
+++ b/Source/Core/VideoCommon/VideoCommon.vcxproj.filters
@@ -122,6 +122,9 @@
       <Filter>Vertex Loading</Filter>
+    <ClCompile Include="VertexLoaderX64.cpp">
+      <Filter>Vertex Loading</Filter>
+    </ClCompile>
       <Filter>Vertex Loading</Filter>
diff --git a/Source/UnitTests/Common/x64EmitterTest.cpp b/Source/UnitTests/Common/x64EmitterTest.cpp
index 6b875752f8..977b19a9bc 100644
--- a/Source/UnitTests/Common/x64EmitterTest.cpp
+++ b/Source/UnitTests/Common/x64EmitterTest.cpp
@@ -609,12 +609,12 @@ TEST_F(x64EmitterTest, MOVZX)
 
 TEST_F(x64EmitterTest, MOVBE)
 {
-	emitter->MOVBE(16, R(RAX), MatR(R12));
-	emitter->MOVBE(16, MatR(RAX), R(R12));
-	emitter->MOVBE(32, R(RAX), MatR(R12));
-	emitter->MOVBE(32, MatR(RAX), R(R12));
-	emitter->MOVBE(64, R(RAX), MatR(R12));
-	emitter->MOVBE(64, MatR(RAX), R(R12));
+	emitter->MOVBE(16, RAX, MatR(R12));
+	emitter->MOVBE(16, MatR(RAX), R12);
+	emitter->MOVBE(32, RAX, MatR(R12));
+	emitter->MOVBE(32, MatR(RAX), R12);
+	emitter->MOVBE(64, RAX, MatR(R12));
+	emitter->MOVBE(64, MatR(RAX), R12);
 	ExpectDisassembly("movbe ax, word ptr ds:[r12] "
 	                  "movbe word ptr ds:[rax], r12w "
 	                  "movbe eax, dword ptr ds:[r12] "