dolphin/Source/Core/VideoCommon/VertexLoader.h

// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Refer to the license.txt file included.

#pragma once

// Top vertex loaders
// Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt

#include <algorithm>
#include <cstring>
#include <string>
#include <type_traits>

#include "Common/CommonTypes.h"
#include "Common/x64Emitter.h"

#include "VideoCommon/CPMemory.h"
#include "VideoCommon/DataReader.h"
#include "VideoCommon/NativeVertexFormat.h"
#include "VideoCommon/VertexLoaderBase.h"
#include "VideoCommon/VertexLoaderUtils.h"

#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif

// The vertex loader JIT path is selected only when building for x86.
#if defined(_M_X86)
#define USE_VERTEX_LOADER_JIT
#endif

// Calling convention attached to pipeline stage functions: __cdecl where WIN32 is
// defined, empty everywhere else.
#if !defined(WIN32)
#define LOADERDECL
#else
#define LOADERDECL __cdecl
#endif

class VertexLoader;

// Pointer to a pipeline stage function; the stage receives the loader it runs for.
using TPipelineFunction = void (LOADERDECL *)(VertexLoader* loader);

// Vertex loader that reports itself as "OldLoader" and claims support for every format
// (IsInitialized() always returns true). Unless _M_GENERIC is defined it additionally
// derives from Gen::X64CodeBlock.
//
// NOTE(review): the base class list is keyed on _M_GENERIC while USE_VERTEX_LOADER_JIT is
// keyed on _M_X86 - confirm the two conditions agree on every supported target.
//
// ARMTODO: This should be done in a better way
#ifndef _M_GENERIC
class VertexLoader : public Gen::X64CodeBlock, public VertexLoaderBase
#else
class VertexLoader : public VertexLoaderBase
#endif
{
public:
	// This class needs a 16 byte alignment. As this is broken on
	// MSVC right now (Dec 2014), we use custom allocation.
	void* operator new (size_t size);
	void operator delete (void *p);

	// Constructs a loader from a vertex descriptor and a vertex attribute table.
	VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
	~VertexLoader();

	// Overrides of the VertexLoaderBase interface.
	// NOTE(review): RunVertices presumably returns the number of vertices written to dst -
	// confirm against VertexLoaderBase.
	int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;
	std::string GetName() const override { return "OldLoader"; }
	bool IsInitialized() override { return true; } // This vertex loader supports all formats

	// They are used for the communication with the loader functions
	// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT
	GC_ALIGNED128(float m_posScale[4]);
	GC_ALIGNED64(float m_tcScale[8][2]);
	int m_tcIndex;
	int m_colIndex;
	int m_colElements[2];

	// Matrix components are first in GC format but later in PC format - we need to store it temporarily
	// when decoding each vertex.
	u8 m_curposmtx;
	u8 m_curtexmtx[8];
	int m_texmtxwrite;
	int m_texmtxread;
	// NOTE(review): presumably a per-vertex skip flag and a running count of skipped
	// vertices - confirm in the implementation file.
	bool m_vertexSkip;
	int m_skippedVertices;

private:
#ifndef USE_VERTEX_LOADER_JIT
	// Pipeline.
	// Table of stage function pointers, present only when the JIT path is disabled.
	TPipelineFunction m_PipelineStages[64];  // TODO - figure out real max. it's lower.
	int m_numPipelineStages;
#endif

	void CompileVertexTranslator();

	void WriteCall(TPipelineFunction);

	// Helpers taking Gen::OpArg operands; declared only when _M_GENERIC is not defined.
#ifndef _M_GENERIC
	void WriteGetVariable(int bits, Gen::OpArg dest, void *address);
	void WriteSetVariable(int bits, void *address, Gen::OpArg dest);
#endif

	// NOTE(review): presumably the entry point of the code produced by
	// CompileVertexTranslator - confirm in the implementation file.
	const u8 *m_compiledCode;
};

#if _M_SSE >= 0x301
// Control masks for _mm_shuffle_epi8, used by Vertex_Read_SSSE3.
// _mm_set_epi32 takes lanes from highest to lowest, so the last argument is lane 0.
// Each control byte names the source byte copied to that position; a control byte of
// 0xFF produces zero. The _3 variants fill three 32-bit lanes, the _2 variants two;
// every other lane is zeroed.

// 32-bit components: reverse the four bytes of each source word.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
// 16-bit components, "l": byte-swap each halfword into the low half of its lane (zero-extended).
static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L);
static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L);
// 16-bit components, "h": byte-swap each halfword into the high half of its lane, ready for
// an arithmetic right shift that sign-extends it.
static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL);
static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL);
// 8-bit components, "l": place each byte in the lowest byte of its lane (zero-extended).
static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L);
static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L);
// 8-bit components, "h": place each byte in the highest byte of its lane, ready for an
// arithmetic right shift that sign-extends it.
static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);

// Reads two or three byte-swapped vertex components of type T from pData, turns them into
// floats and appends them at g_vertex_manager_write_ptr, which is then advanced.
//
// Template parameters:
//   T        - source component type. 4-byte types are only byte-swapped and stored
//              unchanged (no scaling). 1- and 2-byte types are widened to 32 bits
//              (zero-extended when unsigned, sign-extended otherwise), converted to float
//              and multiplied by scale.
//   threeIn  - true to consume three source components, false for two.
//   threeOut - true to emit three floats, false for two.
//
// Parameters:
//   pData - source components; no alignment is required.
//   scale - per-lane multipliers, used only for the 1- and 2-byte component types.
//
// The load width is the needed byte count rounded up to 4, 8 or 16, and the three-float
// store writes a full 16 bytes although the write pointer only advances by 12. Callers
// must therefore keep readable slack after pData and writable slack after the write
// pointer. When threeIn is false and threeOut is true, the third output comes from a
// zeroed lane.
template <typename T, bool threeIn, bool threeOut>
__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale)
{
	__m128i coords;

	// Pick the narrowest load that covers every source component.
	const int loadBytes = static_cast<int>(sizeof(T) * (2 + threeIn));
	if (loadBytes > 8)
	{
		coords = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pData));
	}
	else if (loadBytes > 4)
	{
		coords = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(pData));
	}
	else
	{
		// Copy the four bytes out instead of dereferencing pData as u32: T is an 8- or
		// 16-bit type here, so a direct cast would break strict aliasing and could be
		// misaligned.
		u32 packed;
		std::memcpy(&packed, pData, sizeof(packed));
		coords = _mm_cvtsi32_si128(packed);
	}

	// Float case (no scaling)
	if (sizeof(T) == 4)
	{
		coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2);
		if (threeOut)
			_mm_storeu_si128((__m128i*)g_vertex_manager_write_ptr, coords);
		else
			_mm_storel_epi64((__m128i*)g_vertex_manager_write_ptr, coords);
	}
	else
	{
		// Byte swap and unpack. Unsigned values land in the low bytes of each lane;
		// signed values land in the high bytes so the shift below can sign-extend them.
		__m128i mask;
		if (std::is_unsigned<T>::value)
			mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2);
		else
			mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2);
		coords = _mm_shuffle_epi8(coords, mask);

		// Sign extend: arithmetic shift moves the value back down to the low bits.
		if (std::is_signed<T>::value)
			coords = _mm_srai_epi32(coords, static_cast<int>(32 - sizeof(T) * 8));

		const __m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale);
		if (threeOut)
			_mm_storeu_ps((float*)g_vertex_manager_write_ptr, out);
		else
			_mm_storel_pi((__m64*)g_vertex_manager_write_ptr, out);
	}

	// Advance by the number of floats emitted, not by the width of the store.
	g_vertex_manager_write_ptr += sizeof(float) * (2 + threeOut);
}
#endif
New license header introduced for DiscIO, AudioCommon, InputCommon, VideoCommon, and Common projects. 2013-04-18 03:09:55 +00:00			`// Copyright 2013 Dolphin Emulator Project`
			`// Licensed under GPLv2`
			`// Refer to the license.txt file included.`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00
Replace all include guard ifdefs with "#pragma once" 2014-02-10 18:54:46 +00:00			`#pragma once`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00
VertexLoader: Change some pointer arithmetic to array syntax. should have no effect on performance. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@2255 8ced0084-cf51-0410-be5f-012b33b47a6e 2009-02-15 13:45:03 +00:00			`// Top vertex loaders`
			`// Metroid Prime: P I16-flt N I16-s16 T0 I16-u16 T1 i16-flt`

Rewrote a object-comparison code using a standard function. It's a tiny refactoring. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5160 8ced0084-cf51-0410-be5f-012b33b47a6e 2010-03-05 12:04:09 +00:00			`#include <algorithm>`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00			`#include <string>`

Include CommonTypes.h instead of Common.h. 2014-09-08 01:06:58 +00:00			`#include "Common/CommonTypes.h"`
Convert all includes to relative paths. 2014-02-17 10:18:15 +00:00			`#include "Common/x64Emitter.h"`
OGL plugin: +Autoscale option (attempts to remove borders, even without XFB). Lots of cleanup, especially around aspect ratio and similar stuff. MP2 scanner still broken, wonder when that happened? git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@2470 8ced0084-cf51-0410-be5f-012b33b47a6e 2009-02-28 16:33:59 +00:00
Convert all includes to relative paths. 2014-02-17 10:18:15 +00:00			`#include "VideoCommon/CPMemory.h"`
			`#include "VideoCommon/DataReader.h"`
			`#include "VideoCommon/NativeVertexFormat.h"`
VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00			`#include "VideoCommon/VertexLoaderBase.h"`
VertexLoader: Move the old Datareader function into VertexLoader 2014-11-29 02:39:24 +00:00			`#include "VideoCommon/VertexLoaderUtils.h"`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`#if _M_SSE >= 0x401`
			`#include <smmintrin.h>`
			`#include <emmintrin.h>`
			`#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)`
			`#include <tmmintrin.h>`
			`#endif`

Fix the vertexloader on non-x86 targets. When I dropped ARM from a generic target, this caused the vertexloader to try using the JIT path. Instead of !_M_GENERIC, check for _M_X86 instead. Since it is only for the x86 target 2014-06-13 18:36:54 +00:00			`#ifdef _M_X86`
Fix various warnings reported by clang - mostly remove unused variables - rename some generic JIT identifiers 2014-02-23 14:14:27 +00:00			`#define USE_VERTEX_LOADER_JIT`
			`#endif`
Turn the X86 emitter into a class, so the code pointer is no longer a global, yay! Created XCodeBlock that derives from XEmitter, and the Jit now derives from XCodeBlock so it can call all ADD SUB JNZ etc without having to prefix them with "emit.". I think someone's gonna like this. There's some cleanup still to be done, but hey, it works. There shouldn't be a noticable speed difference. I hope GCC doesn't have a problem with the "member function pointers" I used. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1594 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-19 21:24:52 +00:00
VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00			`#ifdef WIN32`
			`#define LOADERDECL __cdecl`
			`#else`
			`#define LOADERDECL`
			`#endif`

VertexLoader: Add a VertexLoader pointer to each function call 2014-12-13 09:57:46 +00:00			`class VertexLoader;`
			`typedef void (LOADERDECL TPipelineFunction)(VertexLoader loader);`
mark all local variables as static 2014-07-08 13:58:25 +00:00
ARM Support without GLSL 2013-02-26 19:49:00 +00:00			`// ARMTODO: This should be done in a better way`
			`#ifndef _M_GENERIC`
VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00			`class VertexLoader : public Gen::X64CodeBlock, public VertexLoaderBase`
ARM Support without GLSL 2013-02-26 19:49:00 +00:00			`#else`
VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00			`class VertexLoader : public VertexLoaderBase`
ARM Support without GLSL 2013-02-26 19:49:00 +00:00			`#endif`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00			`{`
			`public:`
VertexLoader: Add a VertexLoader pointer to each function call 2014-12-13 09:57:46 +00:00			`// This class need a 16 byte alignment. As this is broken on`
			`// MSVC right now (Dec 2014), we use custom allocation.`
			`void* operator new (size_t size);`
			`void operator delete (void *p);`

set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00			`VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);`
			`~VertexLoader();`

VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00			`int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;`
			`std::string GetName() const override { return "OldLoader"; }`
			`bool IsInitialized() override { return true; } // This vertex loader supports all formats`
Some changes to VertexLoaderManager: - Lazily create the native vertex format (which involves GL calls) from RunVertices rather than RefreshLoader itself, freeing the latter to be run from the CPU thread (hopefully). - In order to avoid useless allocations while doing so, store the native format inside the VertexLoader rather than using a cache entry. - Wrap the s_vertex_loader_map in a lock, for similar reasons. 2014-08-25 03:53:28 +00:00
VertexLoader: Add a VertexLoader pointer to each function call 2014-12-13 09:57:46 +00:00			`// They are used for the communication with the loader functions`
			`// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT`
			`GC_ALIGNED128(float m_posScale[4]);`
			`GC_ALIGNED64(float m_tcScale[8][2]);`
			`int m_tcIndex;`
			`int m_colIndex;`
			`int m_colElements[2];`

			`// Matrix components are first in GC format but later in PC format - we need to store it temporarily`
			`// when decoding each vertex.`
			`u8 m_curposmtx;`
			`u8 m_curtexmtx[8];`
			`int m_texmtxwrite;`
			`int m_texmtxread;`
VertexLoader: Skip vertices with position index = -1 2014-12-21 13:29:44 +00:00			`bool m_vertexSkip;`
			`int m_skippedVertices;`
VertexLoader: Add a VertexLoader pointer to each function call 2014-12-13 09:57:46 +00:00
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00			`private:`
Fix various warnings reported by clang - mostly remove unused variables - rename some generic JIT identifiers 2014-02-23 14:14:27 +00:00			`#ifndef USE_VERTEX_LOADER_JIT`
			`// Pipeline.`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00			`TPipelineFunction m_PipelineStages[64]; // TODO - figure out real max. it's lower.`
			`int m_numPipelineStages;`
Fix various warnings reported by clang - mostly remove unused variables - rename some generic JIT identifiers 2014-02-23 14:14:27 +00:00			`#endif`
set svn:eol-style=native for **.h git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1438 8ced0084-cf51-0410-be5f-012b33b47a6e 2008-12-08 04:46:09 +00:00
			`void CompileVertexTranslator();`

			`void WriteCall(TPipelineFunction);`
Linux 64-bit fix by tinctorius, please verify! git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1845 8ced0084-cf51-0410-be5f-012b33b47a6e 2009-01-10 23:10:33 +00:00
ARM Support without GLSL 2013-02-26 19:49:00 +00:00			`#ifndef _M_GENERIC`
more info in the vertex loader debug display git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@2128 8ced0084-cf51-0410-be5f-012b33b47a6e 2009-02-06 19:52:23 +00:00			`void WriteGetVariable(int bits, Gen::OpArg dest, void *address);`
			`void WriteSetVariable(int bits, void *address, Gen::OpArg dest);`
ARM Support without GLSL 2013-02-26 19:49:00 +00:00			`#endif`
VideoCommon: split VertexLoaderBase from VertexLoader 2014-12-13 00:51:14 +00:00
			`const u8 *m_compiledCode;`
Clean up more space/tab mismatches in AudioCommon, Common, and VideoCommon. Not planning to touch Core since it's the most actively changed part of the project. 2013-03-20 01:51:12 +00:00			`};`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00
			`#if _M_SSE >= 0x301`
			`static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);`
			`static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);`
			`static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L);`
			`static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L);`
			`static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL);`
			`static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL);`
			`static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L);`
			`static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L);`
			`static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);`
			`static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);`

			`template <typename T, bool threeIn, bool threeOut>`
			`__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale)`
			`{`
			`__m128i coords, mask;`

			`int loadBytes = sizeof(T) * (2 + threeIn);`
			`if (loadBytes > 8)`
			`coords = _mm_loadu_si128((__m128i*)pData);`
			`else if (loadBytes > 4)`
			`coords = _mm_loadl_epi64((__m128i*)pData);`
			`else`
			`coords = _mm_cvtsi32_si128((u32)pData);`

			`// Float case (no scaling)`
			`if (sizeof(T) == 4)`
			`{`
			`coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2);`
			`if (threeOut)`
VideoCommon: Rename s_pCurBufferPointer 2014-12-09 07:30:38 +00:00			`_mm_storeu_si128((__m128i*)g_vertex_manager_write_ptr, coords);`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`else`
VideoCommon: Rename s_pCurBufferPointer 2014-12-09 07:30:38 +00:00			`_mm_storel_epi64((__m128i*)g_vertex_manager_write_ptr, coords);`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`}`
			`else`
			`{`
			`// Byte swap, unpack, and move to high bytes for sign extend.`
			`if (std::is_unsigned<T>::value)`
			`mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2);`
			`else`
			`mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2);`
			`coords = _mm_shuffle_epi8(coords, mask);`

			`// Sign extend`
			`if (std::is_signed<T>::value)`
			`coords = _mm_srai_epi32(coords, 32 - sizeof(T) * 8);`

			`__m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale);`
			`if (threeOut)`
VideoCommon: Rename s_pCurBufferPointer 2014-12-09 07:30:38 +00:00			`_mm_storeu_ps((float*)g_vertex_manager_write_ptr, out);`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`else`
VideoCommon: Rename s_pCurBufferPointer 2014-12-09 07:30:38 +00:00			`_mm_storel_pi((__m64*)g_vertex_manager_write_ptr, out);`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`}`

VideoCommon: Rename s_pCurBufferPointer 2014-12-09 07:30:38 +00:00			`g_vertex_manager_write_ptr += sizeof(float) * (2 + threeOut);`
Vertex Loader: SSE implementations of more position/texcoord/normal formats ~35-45% faster NFS:HP2, possibly other vertex-bound games. 2014-11-11 09:48:38 +00:00			`}`
VertexLoader: Move the old Datareader function into VertexLoader 2014-11-29 02:39:24 +00:00			`#endif`