[Linux] Force passing XMM values by pointer

parent 41b0aa9ffd
commit c804b0dff9
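Why the change: the Microsoft x64 calling convention never passes __m128/__m128i/__m128d arguments by value in registers; the caller makes a 16-byte-aligned copy and passes its address. The System V AMD64 ABI used on Linux passes the same arguments directly in XMM registers. The emulation helpers touched below are invoked from JIT-emitted code that hands their vector operands over as addresses, which matches the Windows contract but not the Linux one. Taking the operands by reference makes the "pointer in an integer register" contract explicit in the C++ signature, so the same emitted call sequence works on both platforms. A minimal standalone sketch (not Xenia code) contrasting the two signatures:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

// By value: the Microsoft x64 ABI passes src1/src2 as hidden pointers to
// aligned temporaries, while the System V ABI on Linux expects them in
// xmm0/xmm1. Two different contracts hide behind the same C++ signature.
static __m128i AddBytesByValue(void*, __m128i src1, __m128i src2) {
  return _mm_add_epi8(src1, src2);
}

// By reference: on both ABIs the operands arrive as plain pointers in
// integer registers, which is what the JIT-emitted call site provides.
static __m128i AddBytesByRef(void*, __m128i& src1, __m128i& src2) {
  return _mm_add_epi8(src1, src2);
}

int main() {
  alignas(16) uint8_t a[16], b[16], out[16];
  for (int i = 0; i < 16; ++i) {
    a[i] = uint8_t(i);
    b[i] = 1;
  }
  __m128i va = _mm_load_si128(reinterpret_cast<__m128i*>(a));
  __m128i vb = _mm_load_si128(reinterpret_cast<__m128i*>(b));
  __m128i r1 = AddBytesByValue(nullptr, va, vb);
  __m128i r2 = AddBytesByRef(nullptr, va, vb);
  _mm_store_si128(reinterpret_cast<__m128i*>(out), _mm_add_epi8(r1, r2));
  printf("%u %u\n", unsigned(out[0]), unsigned(out[15]));  // 2 32
  return 0;
}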
@@ -671,7 +671,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
 // OPCODE_VECTOR_SHL
 // ============================================================================
 template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
-static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
+static __m128i EmulateVectorShl(void*, __m128i& src1, __m128i& src2) {
   alignas(16) T value[16 / sizeof(T)];
   alignas(16) T shamt[16 / sizeof(T)];
 
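The loop bodies of these shift helpers are outside the hunk, but the aligned scratch buffers above make the pattern clear: spill both vectors to memory, do the per-lane work in scalar C++, and reload the result. A sketch under that assumption (the lane-width masking of the shift amount matches VMX semantics; the real helper may differ in detail):

#include <emmintrin.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

// Store both operands, shift each lane in scalar code, reload the result.
// Note the operands now arrive as references, i.e. as pointers at ABI level.
template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
static __m128i EmulateVectorShlSketch(void*, __m128i& src1, __m128i& src2) {
  alignas(16) T value[16 / sizeof(T)];
  alignas(16) T shamt[16 / sizeof(T)];
  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
    // Shift amounts wrap at the lane width, as the VMX vector shifts do.
    value[i] = T(value[i] << (shamt[i] & ((sizeof(T) * 8) - 1)));
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}

int main() {
  alignas(16) uint16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(16) uint16_t s[8] = {1, 1, 1, 1, 2, 2, 2, 2};
  __m128i src1 = _mm_load_si128(reinterpret_cast<__m128i*>(v));
  __m128i src2 = _mm_load_si128(reinterpret_cast<__m128i*>(s));
  __m128i r = EmulateVectorShlSketch<uint16_t>(nullptr, src1, src2);
  _mm_store_si128(reinterpret_cast<__m128i*>(v), r);
  printf("%u %u\n", unsigned(v[0]), unsigned(v[4]));  // 2 20
  return 0;
}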
@@ -863,7 +863,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
 // OPCODE_VECTOR_SHR
 // ============================================================================
 template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
-static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
+static __m128i EmulateVectorShr(void*, __m128i& src1, __m128i& src2) {
   alignas(16) T value[16 / sizeof(T)];
   alignas(16) T shamt[16 / sizeof(T)];
 
@@ -1199,7 +1199,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
 // OPCODE_VECTOR_ROTATE_LEFT
 // ============================================================================
 template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
-static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
+static __m128i EmulateVectorRotateLeft(void*, __m128i& src1, __m128i& src2) {
   alignas(16) T value[16 / sizeof(T)];
   alignas(16) T shamt[16 / sizeof(T)];
 
@@ -1289,7 +1289,7 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
 // OPCODE_VECTOR_AVERAGE
 // ============================================================================
 template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
-static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
+static __m128i EmulateVectorAverage(void*, __m128i& src1, __m128i& src2) {
   alignas(16) T src1v[16 / sizeof(T)];
   alignas(16) T src2v[16 / sizeof(T)];
   alignas(16) T value[16 / sizeof(T)];
@@ -1857,7 +1857,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
     e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
   }
-  static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
+  static __m128i EmulateFLOAT16_2(void*, __m128& src1) {
    alignas(16) float a[4];
    alignas(16) uint16_t b[8];
    _mm_store_ps(a, src1);
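The FLOAT16 helpers in PACK and UNPACK perform float-to-half and half-to-float conversion in software. For reference only, the same conversion expressed with the F16C intrinsics, as a standalone sketch rather than the code path this diff touches:

// Compile with -mf16c (or equivalent); requires a CPU with F16C support.
#include <immintrin.h>
#include <cstdio>

int main() {
  __m128 floats = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
  // Pack: 4 x float32 -> 4 x float16 in the low 64 bits of the result.
  __m128i halfs = _mm_cvtps_ph(floats, _MM_FROUND_TO_NEAREST_INT);
  // Unpack: 4 x float16 back to 4 x float32.
  __m128 roundtrip = _mm_cvtph_ps(halfs);
  alignas(16) float out[4];
  _mm_store_ps(out, roundtrip);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 1 2 3 4
  return 0;
}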
@@ -1898,7 +1898,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       e.vmovaps(i.dest, e.xmm0);
     }
   }
-  static __m128i EmulateFLOAT16_4(void*, __m128 src1) {
+  static __m128i EmulateFLOAT16_4(void*, __m128& src1) {
    alignas(16) float a[4];
    alignas(16) uint16_t b[8];
    _mm_store_ps(a, src1);
@@ -2031,8 +2031,8 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     // Merge XZ and YW.
     e.vorps(i.dest, e.xmm0);
   }
-  static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
-                                              __m128i src2) {
+  static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i& src1,
+                                               __m128i& src2) {
     alignas(16) uint16_t a[8];
     alignas(16) uint16_t b[8];
     alignas(16) uint8_t c[16];
@@ -2044,7 +2044,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     }
     return _mm_load_si128(reinterpret_cast<__m128i*>(c));
   }
-  static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i src1, __m128i src2) {
+  static __m128i EmulatePack8_IN_16_UN_UN(void*, __m128i& src1, __m128i& src2) {
    alignas(16) uint8_t a[16];
    alignas(16) uint8_t b[16];
    alignas(16) uint8_t c[16];
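The two hunks above show the start and the end of EmulatePack8_IN_16_UN_UN_SAT (the aligned buffers and the final _mm_load_si128) but not the loop in between. A plausible completion that clamps each unsigned 16-bit lane to 255 before narrowing it to a byte; the lane ordering here is an assumption and is not taken from the diff:

#include <emmintrin.h>
#include <algorithm>
#include <cstdint>

static __m128i Pack8In16UnUnSatSketch(void*, __m128i& src1, __m128i& src2) {
  alignas(16) uint16_t a[8];
  alignas(16) uint16_t b[8];
  alignas(16) uint8_t c[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(b), src2);
  for (int i = 0; i < 8; ++i) {
    // Unsigned saturation: anything above 0xFF becomes 0xFF.
    c[i] = uint8_t(std::min<uint16_t>(a[i], 0xFF));
    c[i + 8] = uint8_t(std::min<uint16_t>(b[i], 0xFF));
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(c));
}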
@@ -2277,7 +2277,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
     // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
   }
-  static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
+  static __m128 EmulateFLOAT16_2(void*, __m128i& src1) {
     alignas(16) uint16_t a[8];
     alignas(16) float b[4];
     _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
@@ -2336,7 +2336,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       e.vmovaps(i.dest, e.xmm0);
     }
   }
-  static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
+  static __m128 EmulateFLOAT16_4(void*, __m128i& src1) {
     alignas(16) uint16_t a[8];
     alignas(16) float b[4];
     _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
@@ -2616,4 +2616,4 @@ EMITTER_OPCODE_TABLE(OPCODE_UNPACK, UNPACK);
 }  // namespace x64
 }  // namespace backend
 }  // namespace cpu
 }  // namespace xe

@@ -2352,7 +2352,7 @@ EMITTER_OPCODE_TABLE(OPCODE_RECIP, RECIP_F32, RECIP_F64, RECIP_V128);
 // TODO(benvanik): use approx here:
 // https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
-  static __m128 EmulatePow2(void*, __m128 src) {
+  static __m128 EmulatePow2(void*, __m128& src) {
     float src_value;
     _mm_store_ss(&src_value, src);
     float result = std::exp2(src_value);
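The scalar POW2 helper is almost entirely visible in this hunk; the only step outside it is moving the scalar result back into an XMM register, presumably with _mm_load_ss (and _mm_load_sd in the F64 variant). The complete shape, as a sketch:

#include <xmmintrin.h>
#include <cmath>

// Completion of the scalar helper shown above; only the final reload is an
// assumption, mirroring the _mm_store_ss on the way in.
static __m128 EmulatePow2Sketch(void*, __m128& src) {
  float src_value;
  _mm_store_ss(&src_value, src);
  float result = std::exp2(src_value);
  return _mm_load_ss(&result);
}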
@@ -2366,7 +2366,7 @@ struct POW2_F32 : Sequence<POW2_F32, I<OPCODE_POW2, F32Op, F32Op>> {
   }
 };
 struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
-  static __m128d EmulatePow2(void*, __m128d src) {
+  static __m128d EmulatePow2(void*, __m128d& src) {
     double src_value;
     _mm_store_sd(&src_value, src);
     double result = std::exp2(src_value);
@@ -2380,7 +2380,7 @@ struct POW2_F64 : Sequence<POW2_F64, I<OPCODE_POW2, F64Op, F64Op>> {
   }
 };
 struct POW2_V128 : Sequence<POW2_V128, I<OPCODE_POW2, V128Op, V128Op>> {
-  static __m128 EmulatePow2(void*, __m128 src) {
+  static __m128 EmulatePow2(void*, __m128& src) {
     alignas(16) float values[4];
     _mm_store_ps(values, src);
     for (size_t i = 0; i < 4; ++i) {
@@ -2403,7 +2403,7 @@ EMITTER_OPCODE_TABLE(OPCODE_POW2, POW2_F32, POW2_F64, POW2_V128);
 // https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
 // TODO(benvanik): this emulated fn destroys all xmm registers! don't do it!
 struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
-  static __m128 EmulateLog2(void*, __m128 src) {
+  static __m128 EmulateLog2(void*, __m128& src) {
     float src_value;
     _mm_store_ss(&src_value, src);
     float result = std::log2(src_value);
@@ -2417,7 +2417,7 @@ struct LOG2_F32 : Sequence<LOG2_F32, I<OPCODE_LOG2, F32Op, F32Op>> {
   }
 };
 struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
-  static __m128d EmulateLog2(void*, __m128d src) {
+  static __m128d EmulateLog2(void*, __m128d& src) {
     double src_value;
     _mm_store_sd(&src_value, src);
     double result = std::log2(src_value);
@@ -2431,7 +2431,7 @@ struct LOG2_F64 : Sequence<LOG2_F64, I<OPCODE_LOG2, F64Op, F64Op>> {
   }
 };
 struct LOG2_V128 : Sequence<LOG2_V128, I<OPCODE_LOG2, V128Op, V128Op>> {
-  static __m128 EmulateLog2(void*, __m128 src) {
+  static __m128 EmulateLog2(void*, __m128& src) {
     alignas(16) float values[4];
     _mm_store_ps(values, src);
     for (size_t i = 0; i < 4; ++i) {
@@ -2713,7 +2713,7 @@ struct SHL_V128 : Sequence<SHL_V128, I<OPCODE_SHL, V128Op, V128Op, I8Op>> {
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateShlV128));
     e.vmovaps(i.dest, e.xmm0);
   }
-  static __m128i EmulateShlV128(void*, __m128i src1, uint8_t src2) {
+  static __m128i EmulateShlV128(void*, __m128i& src1, uint8_t src2) {
     // Almost all instances are shamt = 1, but non-constant.
     // shamt is [0,7]
     uint8_t shamt = src2 & 0x7;
@@ -2790,7 +2790,7 @@ struct SHR_V128 : Sequence<SHR_V128, I<OPCODE_SHR, V128Op, V128Op, I8Op>> {
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateShrV128));
     e.vmovaps(i.dest, e.xmm0);
   }
-  static __m128i EmulateShrV128(void*, __m128i src1, uint8_t src2) {
+  static __m128i EmulateShrV128(void*, __m128i& src1, uint8_t src2) {
     // Almost all instances are shamt = 1, but non-constant.
     // shamt is [0,7]
    uint8_t shamt = src2 & 0x7;
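The SHL_V128 and SHR_V128 hunks also show the call-site half of the contract: the helper's address is erased to void* for CallNativeSafe, and the result is copied out of xmm0 afterwards. Both the Windows and System V ABIs return a 128-bit vector in xmm0, which is why only the argument side of the signatures had to change. A standalone sketch of that shape (illustrative names and body, not the emitter's code; the real EmulateShlV128 performs a whole-vector shift rather than a per-byte one):

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

// The helper the JIT targets: the vector operand arrives by reference, the
// scalar shift amount by value, and the __m128i result goes back in xmm0.
static __m128i ShlByteSketch(void*, __m128i& src1, uint8_t src2) {
  alignas(16) uint8_t bytes[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(bytes), src1);
  for (int i = 0; i < 16; ++i) {
    bytes[i] = uint8_t(bytes[i] << (src2 & 0x7));  // per-byte shift, for illustration
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(bytes));
}

int main() {
  using Fn = __m128i (*)(void*, __m128i&, uint8_t);
  void* erased = reinterpret_cast<void*>(ShlByteSketch);  // what the emitter stores
  Fn fn = reinterpret_cast<Fn>(erased);                   // what the call site recovers
  alignas(16) uint8_t in[16] = {1, 2, 3, 4};
  __m128i v = _mm_load_si128(reinterpret_cast<__m128i*>(in));
  __m128i r = fn(nullptr, v, 1);
  alignas(16) uint8_t out[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
  printf("%u %u %u %u\n", unsigned(out[0]), unsigned(out[1]), unsigned(out[2]),
         unsigned(out[3]));  // 2 4 6 8
  return 0;
}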