[Linux] Force passing XMM values by pointer

This commit is contained in:
uytvbn 2017-10-24 22:49:31 +02:00 committed by Sandy Carter
parent 41b0aa9ffd
commit c804b0dff9
2 changed files with 20 additions and 20 deletions

View File

@ -671,7 +671,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
// OPCODE_VECTOR_SHL
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
static __m128i EmulateVectorShl(void*, __m128i& src1, __m128i& src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
@ -863,7 +863,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
// OPCODE_VECTOR_SHR
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
static __m128i EmulateVectorShr(void*, __m128i& src1, __m128i& src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
@ -1199,7 +1199,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
// OPCODE_VECTOR_ROTATE_LEFT
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
static __m128i EmulateVectorRotateLeft(void*, __m128i& src1, __m128i& src2) {
alignas(16) T value[16 / sizeof(T)];
alignas(16) T shamt[16 / sizeof(T)];
@ -1289,7 +1289,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
// OPCODE_VECTOR_AVERAGE
// ============================================================================
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
static __m128i EmulateVectorAverage(void*, __m128i& src1, __m128i& src2) {
alignas(16) T src1v[16 / sizeof(T)];
alignas(16) T src2v[16 / sizeof(T)];
alignas(16) T value[16 / sizeof(T)];
@ -1857,7 +1857,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
}
static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
static __m128i EmulateFLOAT16_2(void*, __m128& src1) {
alignas(16) float a[4];
alignas(16) uint16_t b[8];
_mm_store_ps(a, src1);
@ -1898,7 +1898,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
e.vmovaps(i.dest, e.xmm0);
}
}
static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
static __m128i EmulateFLOAT16_4(void*, __m128& src1) {
alignas(16) float a[4];
alignas(16) uint16_t b[8];
_mm_store_ps(a, src1);
@ -2031,8 +2031,8 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// Merge XZ and YW.
e.vorps(i.dest, e.xmm0);
}
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
__m128i src2) {
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i& src1,
__m128i& src2) {
alignas(16) uint16_t a[8];
alignas(16) uint16_t b[8];
alignas(16) uint8_t c[16];
@ -2044,7 +2044,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
}
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
}
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i& src1, __m128i& src2) {
alignas(16) uint8_t a[16];
alignas(16) uint8_t b[16];
alignas(16) uint8_t c[16];
@ -2277,7 +2277,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
// To convert to the 0 to 1 range, games multiply by 0x47008081 and add 0xC7008081.
}
static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
static __m128 EmulateFLOAT16_2(void*, __m128i& src1) {
alignas(16) uint16_t a[8];
alignas(16) float b[4];
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
@ -2336,7 +2336,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
e.vmovaps(i.dest, e.xmm0);
}
}
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
static __m128 EmulateFLOAT16_4(void*, __m128i& src1) {
alignas(16) uint16_t a[8];
alignas(16) float b[4];
_mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
@ -2616,4 +2616,4 @@ EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK);
} // namespace x64
} // namespace backend
} // namespace cpu
} // namespace xe
} // namespace xe

View File

@ -2352,7 +2352,7 @@ EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128);
// TODO(benvanik): use approx here:
// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
static __m128 EmulatePow2(void*, __m128 src) {
static __m128 EmulatePow2(void*, __m128& src) {
float src_value;
_mm_store_ss(&src_value, src);
float result = std::exp2(src_value);
@ -2366,7 +2366,7 @@ struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
}
};
struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
static __m128d EmulatePow2(void*, __m128d src) {
static __m128d EmulatePow2(void*, __m128d& src) {
double src_value;
_mm_store_sd(&src_value, src);
double result = std::exp2(src_value);
@ -2380,7 +2380,7 @@ struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
}
};
struct POW2_V128 : Sequence<POW2_V128, I<OPCODE_POW2, V128Op, V128Op>> {
static __m128 EmulatePow2(void*, __m128 src) {
static __m128 EmulatePow2(void*, __m128& src) {
alignas(16) float values[4];
_mm_store_ps(values, src);
for (size_t i = 0; i < 4; ++i) {
@ -2403,7 +2403,7 @@ EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128);
// https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
// TODO(benvanik): this emulated function destroys all XMM registers! Don't do it!
struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
static __m128 EmulateLog2(void*, __m128 src) {
static __m128 EmulateLog2(void*, __m128& src) {
float src_value;
_mm_store_ss(&src_value, src);
float result = std::log2(src_value);
@ -2417,7 +2417,7 @@ struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
}
};
struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
static __m128d EmulateLog2(void*, __m128d src) {
static __m128d EmulateLog2(void*, __m128d& src) {
double src_value;
_mm_store_sd(&src_value, src);
double result = std::log2(src_value);
@ -2431,7 +2431,7 @@ struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
}
};
struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
static __m128 EmulateLog2(void*, __m128 src) {
static __m128 EmulateLog2(void*, __m128& src) {
alignas(16) float values[4];
_mm_store_ps(values, src);
for (size_t i = 0; i < 4; ++i) {
@ -2713,7 +2713,7 @@ struct SHL_V128 : Sequence<SHL_V128, I<OPCODE_SHL, V128Op, V128Op, I8Op>> {
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShlV128));
e.vmovaps(i.dest, e.xmm0);
}
static __m128i EmulateShlV128(void*, __m128i src1, uint8_t src2) {
static __m128i EmulateShlV128(void*, __m128i& src1, uint8_t src2) {
// Almost all instances are shamt = 1, but non-constant.
// shamt is [0,7]
uint8_t shamt = src2 & 0x7;
@ -2790,7 +2790,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
e.vmovaps(i.dest, e.xmm0);
}
static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
static __m128i EmulateShrV128(void*, __m128i& src1, uint8_t src2) {
// Almost all instances are shamt = 1, but non-constant.
// shamt is [0,7]
uint8_t shamt = src2 & 0x7;