GPU / SoftRasterizer: Fix a build issue for Altivec-enabled code. (Related to commits c41a006 and 43d3883.)

rogerman 2018-05-17 20:06:31 -07:00
parent 9c128460c4
commit c024a78a43
2 changed files with 2339 additions and 2319 deletions
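The matrix.h hunks below replace the vec_splat_u16() / vec_splat_u32() intrinsics with brace-initialized vector literals. The vec_splat_* AltiVec intrinsics only accept a small compile-time literal (a 5-bit signed immediate in the range -16..15), so passing the runtime val argument from the memset helpers does not compile on AltiVec-enabled builds; building the splat with a vector literal works for any runtime value. A minimal sketch of the same pattern outside the DeSmuME sources (the helper names here are made up for illustration):

#include <altivec.h>
#include <stddef.h>

typedef __vector unsigned short v128u16;   // roughly the v128u16 typedef that matrix.h gets from types.h

// Does not build: vec_splat_u16() requires a compile-time literal,
// so a runtime 'val' is rejected by the compiler.
//     const v128u16 bad = vec_splat_u16(val);

// The fix used in this commit: splat a runtime value via a vector literal.
static v128u16 splat_u16(unsigned short val)
{
	return (v128u16){val, val, val, val, val, val, val, val};
}

// Example use, mirroring memset_u16() in matrix.h; dst must be 16-byte
// aligned because vec_st() performs aligned stores.
static void fill_u16(unsigned short *dst, unsigned short val, size_t count)
{
	const v128u16 v = splat_u16(val);
	for (size_t i = 0; i < count / 8; i++)
		vec_st(v, 0, (v128u16 *)dst + i);
}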


@@ -1,41 +1,41 @@
/*
	Copyright (C) 2006-2007 shash
	Copyright (C) 2007-2018 DeSmuME team

	This file is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 2 of the License, or
	(at your option) any later version.

	This file is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with the this software. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef MATRIX_H
#define MATRIX_H

#include <math.h>
#include <string.h>

#include "types.h"
#include "mem.h"

#ifdef ENABLE_SSE
#include <xmmintrin.h>
#endif

#ifdef ENABLE_SSE2
#include <emmintrin.h>
#endif

#ifdef ENABLE_SSE4_1
#include "smmintrin.h"
#endif

enum MatrixMode
{
@@ -46,8 +46,8 @@ enum MatrixMode
};

template<MatrixMode MODE>
struct MatrixStack
{
	static const size_t size = ((MODE == MATRIXMODE_PROJECTION) || (MODE == MATRIXMODE_TEXTURE)) ? 1 : 32;
	static const MatrixMode type = MODE;
@@ -55,7 +55,7 @@ struct MatrixStack
	u32 position;
};

void MatrixInit(s32 (&mtx)[16]);
void MatrixInit(float (&mtx)[16]);

void MatrixIdentity(s32 (&mtx)[16]);
@@ -70,29 +70,29 @@ void MatrixCopy(float (&mtxDst)[16], const float (&mtxSrc)[16]);
void MatrixCopy(float (&__restrict mtxDst)[16], const s32 (&__restrict mtxSrc)[16]);

int MatrixCompare(const s32 (&mtxDst)[16], const s32 (&mtxSrc)[16]);
int MatrixCompare(const float (&mtxDst)[16], const float (&mtxSrc)[16]);

s32 MatrixGetMultipliedIndex(const u32 index, const s32 (&mtxA)[16], const s32 (&mtxB)[16]);
float MatrixGetMultipliedIndex(const u32 index, const float (&mtxA)[16], const float (&mtxB)[16]);

template<MatrixMode MODE> void MatrixStackInit(MatrixStack<MODE> *stack);
template<MatrixMode MODE> s32* MatrixStackGet(MatrixStack<MODE> *stack);

void Vector2Copy(float *dst, const float *src);
void Vector2Add(float *dst, const float *src);
void Vector2Subtract(float *dst, const float *src);
float Vector2Dot(const float *a, const float *b);
float Vector2Cross(const float *a, const float *b);

float Vector3Dot(const float *a, const float *b);
void Vector3Cross(float* dst, const float *a, const float *b);
float Vector3Length(const float *a);
void Vector3Add(float *dst, const float *src);
void Vector3Subtract(float *dst, const float *src);
void Vector3Scale(float *dst, const float scale);
void Vector3Copy(float *dst, const float *src);
void Vector3Normalize(float *dst);

void Vector4Copy(float *dst, const float *src);
@@ -111,43 +111,43 @@ void MatrixMultVec3x3(const s32 (&__restrict mtx)[16], s32 (&__restrict vec)[4])
void MatrixTranslate(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
void MatrixScale(s32 (&__restrict mtx)[16], const s32 (&__restrict vec)[4]);
void MatrixMultiply(s32 (&__restrict mtxA)[16], const s32 (&__restrict mtxB)[16]);

//these functions are an unreliable, inaccurate floor.
//it should only be used for positive numbers
//this isnt as fast as it could be if we used a visual c++ intrinsic, but those appear not to be universally available
FORCEINLINE u32 u32floor(float f)
{
#ifdef ENABLE_SSE2
	return (u32)_mm_cvtt_ss2si(_mm_set_ss(f));
#else
	return (u32)f;
#endif
}

FORCEINLINE u32 u32floor(double d)
{
#ifdef ENABLE_SSE2
	return (u32)_mm_cvttsd_si32(_mm_set_sd(d));
#else
	return (u32)d;
#endif
}

//same as above but works for negative values too.
//be sure that the results are the same thing as floorf!
FORCEINLINE s32 s32floor(float f)
{
#ifdef ENABLE_SSE2
	return _mm_cvtss_si32( _mm_add_ss(_mm_set_ss(-0.5f),_mm_add_ss(_mm_set_ss(f), _mm_set_ss(f))) ) >> 1;
#else
	return (s32)floorf(f);
#endif
}

FORCEINLINE s32 s32floor(double d)
{
	return s32floor((float)d);
}

// SIMD Functions
//-------------
#if defined(ENABLE_AVX2)
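A side note on the hunk above: the SSE2 branch of s32floor() has no floor instruction to lean on, so it rounds 2*f - 0.5 to the nearest integer and then arithmetic-shifts right by one. For f = k + r with integer k and 0 <= r < 1, 2*f - 0.5 = 2k + (2r - 0.5), which rounds to either 2k or 2k + 1, and the shift turns both back into k, i.e. floor(f). A scalar model of the same trick (hypothetical helper name; assumes the default round-to-nearest mode that _mm_cvtss_si32 uses):

#include <cmath>
#include <cstdint>

// Scalar equivalent of the SSE2 path of s32floor():
// round-to-nearest(2*f - 0.5), then an arithmetic right shift by one.
// lrintf() rounds to nearest under the default rounding mode, much like
// _mm_cvtss_si32 does with the default MXCSR settings.
static int32_t s32floor_scalar_model(float f)
{
	return (int32_t)(lrintf(2.0f * f - 0.5f) >> 1);
}

As with the intrinsic version, this only matches floorf() while 2*f - 0.5 stays exactly representable and within 32-bit range, which is why the comments in the header warn to check the results against floorf().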
@@ -188,46 +188,46 @@ static void memset_u32_fast(void *dst, const u32 val)
	const v256u32 val_vec256 = _mm256_set1_epi32(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v256u32) / sizeof(u32)), _mm256_store_si256(dst_vec256 + (X), val_vec256));
}

#elif defined(ENABLE_SSE2)

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
	const v128u16 val_vec128 = _mm_set1_epi16(val);

	for (size_t i = 0; i < length_vec128; i++)
		_mm_stream_si128(dst_vec128 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const v128u16 val_vec128 = _mm_set1_epi16(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}

static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
	const v128u32 val_vec128 = _mm_set1_epi32(val);

	for (size_t i = 0; i < length_vec128; i++)
		_mm_stream_si128(dst_vec128 + i, val_vec128);
}

template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const v128u32 val_vec128 = _mm_set1_epi32(val);
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), _mm_store_si128(dst_vec128 + (X), val_vec128));
}

#elif defined(ENABLE_ALTIVEC)
@@ -236,7 +236,7 @@ static void memset_u16(void *dst, const u16 val, const size_t elementCount)
	v128u16 *dst_vec128 = (v128u16 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u16) / sizeof(u16));
-	const v128u16 val_vec128 = vec_splat_u16(val);
+	const v128u16 val_vec128 = (v128u16){val,val,val,val,val,val,val,val};

	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
@@ -246,7 +246,7 @@ static void memset_u16_fast(void *dst, const u16 val)
{
	v128u16 *dst_vec128 = (v128u16 *)dst;
-	const v128u16 val_vec128 = vec_splat_u16(val);
+	const v128u16 val_vec128 = (v128u16){val,val,val,val,val,val,val,val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u16) / sizeof(u16)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}
@@ -255,7 +255,7 @@ static void memset_u32(void *dst, const u32 val, const size_t elementCount)
	v128u32 *dst_vec128 = (v128u32 *)dst;
	const size_t length_vec128 = elementCount / (sizeof(v128u32) / sizeof(u32));
-	const v128u32 val_vec128 = vec_splat_u32(val);
+	const v128u32 val_vec128 = (v128u32){val,val,val,val};

	for (size_t i = 0; i < length_vec128; i++)
		vec_st(val_vec128, 0, dst_vec128 + i);
}
@@ -265,68 +265,68 @@ static void memset_u32_fast(void *dst, const u32 val)
{
	v128u32 *dst_vec128 = (v128u32 *)dst;
-	const v128u32 val_vec128 = vec_splat_u32(val);
+	const v128u32 val_vec128 = (v128u32){val,val,val,val};
	MACRODO_N(ELEMENTCOUNT / (sizeof(v128u32) / sizeof(u32)), vec_st(val_vec128, 0, dst_vec128 + (X)));
}

#else // No SIMD

static void memset_u16(void *dst, const u16 val, const size_t elementCount)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
	const size_t length_u64 = elementCount / (sizeof(val_u64) / sizeof(val));

	for (size_t i = 0; i < length_u64; i++)
		dst_u64[i] = val_u64;
#else
	for (size_t i = 0; i < elementCount; i++)
		((u16 *)dst)[i] = val;
#endif
}

template <size_t ELEMENTCOUNT>
static void memset_u16_fast(void *dst, const u16 val)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 48) | ((u64)val << 32) | ((u64)val << 16) | (u64)val;
	MACRODO_N(ELEMENTCOUNT / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
	for (size_t i = 0; i < ELEMENTCOUNT; i++)
		((u16 *)dst)[i] = val;
#endif
}

static void memset_u32(void *dst, const u32 val, const size_t elementCount)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 32) | (u64)val;
	const size_t length_u64 = elementCount / (sizeof(val_u64) / sizeof(val));

	for (size_t i = 0; i < length_u64; i++)
		dst_u64[i] = val_u64;
#else
	for (size_t i = 0; i < elementCount; i++)
		((u32 *)dst)[i] = val;
#endif
}

template <size_t ELEMENTCOUNT>
static void memset_u32_fast(void *dst, const u32 val)
{
#ifdef HOST_64
	u64 *dst_u64 = (u64 *)dst;
	const u64 val_u64 = ((u64)val << 32) | (u64)val;
	MACRODO_N(ELEMENTCOUNT / (sizeof(val_u64) / sizeof(val)), (dst_u64[(X)] = val_u64));
#else
	for (size_t i = 0; i < ELEMENTCOUNT; i++)
		((u16 *)dst)[i] = val;
#endif
}

#endif // SIMD Functions

#endif // MATRIX_H
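For context on how these helpers are meant to be called (the buffer and call below are illustrative, not taken from the commit): every SIMD path uses aligned stores (_mm_store_si128, _mm256_store_si256, vec_st), so callers keep their destination buffers 16-byte aligned (32-byte for the AVX2 path) and pass element counts that are multiples of the vector width. A hedged usage sketch:

#include "matrix.h"   // the header shown in the diff above
#include <cstdint>

// Hypothetical 256x192 16-bit buffer (the NDS screen dimensions),
// over-aligned so the aligned stores in every SIMD path are legal.
alignas(32) static uint16_t s_buffer[256 * 192];

static void ClearBuffer(uint16_t clearColor)
{
	// The element count is a compile-time constant, so the unrolled
	// "_fast" variant can be used instead of the loop-based memset_u16().
	memset_u16_fast<256 * 192>(s_buffer, clearColor);
}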

File diff suppressed because it is too large.