GPU / SoftRasterizer: Fix a build issue for Altivec-enabled code. (Related to commits c41a006 and 43d3883.)

rogerman 2018-05-17 20:06:31 -07:00
parent 9c128460c4
commit c024a78a43
2 changed files with 2339 additions and 2319 deletions
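The matrix.h hunks below replace the vec_splat_u16() / vec_splat_u32() intrinsics with brace-initialized vector literals. The vec_splat_* AltiVec intrinsics only accept a small compile-time literal (a 5-bit signed immediate in the range -16..15), so passing the runtime val argument from the memset helpers does not compile on AltiVec-enabled builds; building the splat with a vector literal works for any runtime value. A minimal sketch of the same pattern outside the DeSmuME sources (the helper names here are made up for illustration):

#include <altivec.h>
#include <stddef.h>

typedef __vector unsigned short v128u16;   // roughly the v128u16 typedef that matrix.h gets from types.h

// Does not build: vec_splat_u16() requires a compile-time literal,
// so a runtime 'val' is rejected by the compiler.
//     const v128u16 bad = vec_splat_u16(val);

// The fix used in this commit: splat a runtime value via a vector literal.
static v128u16 splat_u16(unsigned short val)
{
	return (v128u16){val, val, val, val, val, val, val, val};
}

// Example use, mirroring memset_u16() in matrix.h; dst must be 16-byte
// aligned because vec_st() performs aligned stores.
static void fill_u16(unsigned short *dst, unsigned short val, size_t count)
{
	const v128u16 v = splat_u16(val);
	for (size_t i = 0; i < count / 8; i++)
		vec_st(v, 0, (v128u16 *)dst + i);
}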


@@ -1,41 +1,41 @@
/*
	Copyright (C) 2006-2007 shash
	Copyright (C) 2007-2018 DeSmuME team

	This file is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 2 of the License, or
	(at your option) any later version.

	This file is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with the this software. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef MATRIX_H
#define MATRIX_H

#include <math.h>
#include <string.h>

#include "types.h"
#include "mem.h"

#ifdef ENABLE_SSE
#include <xmmintrin.h>
#endif

#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif

#ifdef ENABLE_SSE4_1
#include "smmintrin.h"
#endif

enum MatrixMode
{
@@ -46,8 +46,8 @@ enum MatrixMode
};

template<MatrixMode MODE>
struct MatrixStack
{
	static const size_t size = ((MODE == MATRIXMODE_PROJECTION) || (MODE == MATRIXMODE_TEXTURE)) ? 1 : 32;
	static const MatrixMode type = MODE;
@@ -55,7 +55,7 @@ struct MatrixStack
	u32 position;
};

void MatrixInit(s32 (&mtx)[16]);
void MatrixInit(float (&mtx)[16]);

void MatrixIdentity(s32 (&mtx)[16]);
@@ -70,29 +70,29 @@ void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]);
void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]);

int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]);

s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]);
float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]);

template<MatrixMode MODE> void MatrixStackInit(MatrixStack<MODE> *stack);
template<MatrixMode MODE> s32* MatrixStackGet(MatrixStack<MODE> *stack);

void Vector2Copy(float *dst, const float *src);
void Vector2Add(float *dst, const float *src);
void Vector2Subtract(float *dst, const float *src);
float Vector2Dot(const float *a, const float *b);
float Vector2Cross(const float *a, const float *b);

float Vector3Dot(const float *a, const float *b);
void Vector3Cross(float* dst, const float *a, const float *b);
float Vector3Length(const float *a);
void Vector3Add(float *dst, const float *src);
void Vector3Subtract(float *dst, const float *src);
void Vector3Scale(float *dst, const float scale);
void Vector3Copy(float *dst, const float *src);
void Vector3Normalize(float *dst);

void Vector4Copy(float *dst, const float *src);
@@ -111,43 +111,43 @@ void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);

//these functions are an unreliable, inaccurate floor.
//it should only be used for positive numbers
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
FORCEINLINE u32 u32floor(float f)
{
#ifdef ENABLE_SSE2
	return (u32)_mm_cvtt_ss2si(_mm_set_ss(f));
#else
	return (u32)f;
#endif
}

FORCEINLINE u32 u32floor(double d)
{
#ifdef ENABLE_SSE2
	return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
#else
	return (u32)d;
#endif
}

//same as above but works for negative values too.
//be sure that the results are the same thing as floorf!
FORCEINLINE s32 s32floor(float f)
{
#ifdef ENABLE_SSE2
	return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
#else
	return (s32)floorf(f);
#endif
}

FORCEINLINE s32 s32floor(double d)
{
	return s32floor((float)d);
}

// SIMD Functions
//-------------
#if defined(ENABLE_AVX2)
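A side note on the hunk above: the SSE2 branch of s32floor() has no floor instruction to lean on, so it rounds 2*f - 0.5 to the nearest integer and then arithmetic-shifts right by one. For f = k + r with integer k and 0 <= r < 1, 2*f - 0.5 = 2k + (2r - 0.5), which rounds to either 2k or 2k + 1, and the shift turns both back into k, i.e. floor(f). A scalar model of the same trick (hypothetical helper name; assumes the default round-to-nearest mode that _mm_cvtss_si32 uses):

#include <cmath>
#include <cstdint>

// Scalar equivalent of the SSE2 path of s32floor():
// round-to-nearest(2*f - 0.5), then an arithmetic right shift by one.
// lrintf() rounds to nearest under the default rounding mode, much like
// _mm_cvtss_si32 does with the default MXCSR settings.
static int32_t s32floor_scalar_model(float f)
{
	return (int32_t)(lrintf(2.0f * f - 0.5f) >> 1);
}

As with the intrinsic version, this only matches floorf() while 2*f - 0.5 stays exactly representable and within 32-bit range, which is why the comments in the header warn to check the results against floorf().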
@@ -188,46 +188,46 @@ static void memset_u32_fast(void *dst, const u32 val)
	const v256u32 val_vec256 = _mm256_set1_epi32(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v256u32) / sizeof(u32)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
}

#elif defined(ENABLE_SSE2)

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
	const v128u16 val_vec128 = _mm_set1_epi16(val);

	for (size_t i = 0; i < length_vec128; i++)
		_mm_stream_si128(dst_vec128 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const v128u16 val_vec128 = _mm_set1_epi16(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}

static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
	const v128u32 val_vec128 = _mm_set1_epi32(val);

	for (size_t i = 0; i < length_vec128; i++)
		_mm_stream_si128(dst_vec128 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const v128u32 val_vec128 = _mm_set1_epi32(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}

#elif defined(ENABLE_ALTIVEC)
@@ -236,7 +236,7 @@ static void memset_u16(void *dst, const u16 val, const size_t elementCount)
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
-	const v128u16 val_vec128 = vec_splat_u16(val);
+	const v128u16 val_vec128 = (v128u16){val,val,val,val,val,val,val,val};

	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
@@ -246,7 +246,7 @@ static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
-	const v128u16 val_vec128 = vec_splat_u16(val);
+	const v128u16 val_vec128 = (v128u16){val,val,val,val,val,val,val,val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
@@ -255,7 +255,7 @@ static void memset_u32(void *dst, const u32 val, const size_t elementCount)
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
-	const v128u32 val_vec128 = vec_splat_u32(val);
+	const v128u32 val_vec128 = (v128u32){val,val,val,val};

	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
@@ -265,68 +265,68 @@ static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
-	const v128u32 val_vec128 = vec_splat_u32(val);
+	const v128u32 val_vec128 = (v128u32){val,val,val,val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}

#else // No SIMD

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
	const size_t length_u64 = elementCount / (sizeof(val_u64) / sizeof(val));

	for (size_t i = 0; i < length_u64; i++)
		dst_u64[i] = val_u64;
#else
	for (size_t i = 0; i < elementCount; i++)
		((u16 *)dst)[i] = val;
#endif
}

template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
	MACRODO_N(ELEMENTCOUNT / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
	for (size_t i = 0; i < ELEMENTCOUNT; i++)
		((u16 *)dst)[i] = val;
#endif
}

static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 32) | (u64)val;
	const size_t length_u64 = elementCount / (sizeof(val_u64) / sizeof(val));

	for (size_t i = 0; i < length_u64; i++)
		dst_u64[i] = val_u64;
#else
	for (size_t i = 0; i < elementCount; i++)
		((u32 *)dst)[i] = val;
#endif
}

template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 32) | (u64)val;
	MACRODO_N(ELEMENTCOUNT / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
	for (size_t i = 0; i < ELEMENTCOUNT; i++)
		((u16 *)dst)[i] = val;
#endif
}

#endif // SIMD Functions

#endif // MATRIX_H
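For context on how these helpers are meant to be called (the buffer and call below are illustrative, not taken from the commit): every SIMD path uses aligned stores (_mm_store_si128, _mm256_store_si256, vec_st), so callers keep their destination buffers 16-byte aligned (32-byte for the AVX2 path) and pass element counts that are multiples of the vector width. A hedged usage sketch:

#include "matrix.h"   // the header shown in the diff above
#include <cstdint>

// Hypothetical 256x192 16-bit buffer (the NDS screen dimensions),
// over-aligned so the aligned stores in every SIMD path are legal.
alignas(32) static uint16_t s_buffer[256 * 192];

static void ClearBuffer(uint16_t clearColor)
{
	// The element count is a compile-time constant, so the unrolled
	// "_fast" variant can be used instead of the loop-based memset_u16().
	memset_u16_fast<256 * 192>(s_buffer, clearColor);
}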

File diff suppressed because it is too large.