target/i386: Destructive vector helpers for AVX

These helpers need to take special care to avoid overwriting source values
before the whole result has been calculated.  Currently they use a dummy
Reg-typed variable to store the result, then assign the whole register.
This will cause 128-bit operations to corrupt the upper half of the register,
so replace it with explicit temporaries and element assignments.

Signed-off-by: Paul Brook <paul@nowt.org>
Message-Id: <20220424220204.2493824-14-paul@nowt.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
Paul Brook 2022-04-24 23:01:35 +01:00 committed by Paolo Bonzini
parent e894bae8cb
commit d45b0de63d
1 changed files with 262 additions and 294 deletions

View File

@ -41,6 +41,7 @@
#endif #endif
#define LANE_WIDTH (SHIFT ? 16 : 8) #define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
/* /*
* Copy the relevant parts of a Reg value around. In the case where * Copy the relevant parts of a Reg value around. In the case where
@ -474,71 +475,81 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
} }
#endif #endif
#define SHUFFLE4(F, a, b, offset) do { \
r0 = a->F((order & 3) + offset); \
r1 = a->F(((order >> 2) & 3) + offset); \
r2 = b->F(((order >> 4) & 3) + offset); \
r3 = b->F(((order >> 6) & 3) + offset); \
d->F(offset) = r0; \
d->F(offset + 1) = r1; \
d->F(offset + 2) = r2; \
d->F(offset + 3) = r3; \
} while (0)
#if SHIFT == 0 #if SHIFT == 0
void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; uint16_t r0, r1, r2, r3;
r.W(0) = s->W(order & 3); SHUFFLE4(W, s, s, 0);
r.W(1) = s->W((order >> 2) & 3);
r.W(2) = s->W((order >> 4) & 3);
r.W(3) = s->W((order >> 6) & 3);
MOVE(*d, r);
} }
#else #else
void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; Reg *v = d;
uint32_t r0, r1, r2, r3;
int i;
r.L(0) = d->L(order & 3); for (i = 0; i < 2 << SHIFT; i += 4) {
r.L(1) = d->L((order >> 2) & 3); SHUFFLE4(L, v, s, i);
r.L(2) = s->L((order >> 4) & 3); }
r.L(3) = s->L((order >> 6) & 3);
MOVE(*d, r);
} }
void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; Reg *v = d;
uint64_t r0, r1;
int i;
r.Q(0) = d->Q(order & 1); for (i = 0; i < 1 << SHIFT; i += 2) {
r.Q(1) = s->Q((order >> 1) & 1); r0 = v->Q(((order & 1) & 1) + i);
MOVE(*d, r); r1 = s->Q(((order >> 1) & 1) + i);
d->Q(i) = r0;
d->Q(i + 1) = r1;
order >>= 2;
}
} }
void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; uint32_t r0, r1, r2, r3;
int i;
r.L(0) = s->L(order & 3); for (i = 0; i < 2 << SHIFT; i += 4) {
r.L(1) = s->L((order >> 2) & 3); SHUFFLE4(L, s, s, i);
r.L(2) = s->L((order >> 4) & 3); }
r.L(3) = s->L((order >> 6) & 3);
MOVE(*d, r);
} }
void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; uint16_t r0, r1, r2, r3;
int i, j;
r.W(0) = s->W(order & 3); for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
r.W(1) = s->W((order >> 2) & 3); SHUFFLE4(W, s, s, i);
r.W(2) = s->W((order >> 4) & 3); d->Q(j) = s->Q(j);
r.W(3) = s->W((order >> 6) & 3); }
r.Q(1) = s->Q(1);
MOVE(*d, r);
} }
void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
{ {
Reg r; uint16_t r0, r1, r2, r3;
int i, j;
r.Q(0) = s->Q(0); for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
r.W(4) = s->W(4 + (order & 3)); d->Q(j) = s->Q(j);
r.W(5) = s->W(4 + ((order >> 2) & 3)); SHUFFLE4(W, s, s, i);
r.W(6) = s->W(4 + ((order >> 4) & 3)); }
r.W(7) = s->W(4 + ((order >> 6) & 3));
MOVE(*d, r);
} }
#endif #endif
@ -1091,156 +1102,132 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
return val; return val;
} }
void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #define PACK_HELPER_B(name, F) \
{ void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
Reg r; Reg *d, Reg *s) \
{ \
r.B(0) = satsb((int16_t)d->W(0)); Reg *v = d; \
r.B(1) = satsb((int16_t)d->W(1)); uint8_t r[PACK_WIDTH * 2]; \
r.B(2) = satsb((int16_t)d->W(2)); int j, k; \
r.B(3) = satsb((int16_t)d->W(3)); for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
#if SHIFT == 1 for (k = 0; k < PACK_WIDTH; k++) { \
r.B(4) = satsb((int16_t)d->W(4)); r[k] = F((int16_t)v->W(j + k)); \
r.B(5) = satsb((int16_t)d->W(5)); } \
r.B(6) = satsb((int16_t)d->W(6)); for (k = 0; k < PACK_WIDTH; k++) { \
r.B(7) = satsb((int16_t)d->W(7)); r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
#endif } \
r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); for (k = 0; k < PACK_WIDTH * 2; k++) { \
r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1)); d->B(2 * j + k) = r[k]; \
r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); } \
r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); } \
#if SHIFT == 1
r.B(12) = satsb((int16_t)s->W(4));
r.B(13) = satsb((int16_t)s->W(5));
r.B(14) = satsb((int16_t)s->W(6));
r.B(15) = satsb((int16_t)s->W(7));
#endif
MOVE(*d, r);
} }
void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) PACK_HELPER_B(sswb, satsb)
{ PACK_HELPER_B(uswb, satub)
Reg r;
r.B(0) = satub((int16_t)d->W(0));
r.B(1) = satub((int16_t)d->W(1));
r.B(2) = satub((int16_t)d->W(2));
r.B(3) = satub((int16_t)d->W(3));
#if SHIFT == 1
r.B(4) = satub((int16_t)d->W(4));
r.B(5) = satub((int16_t)d->W(5));
r.B(6) = satub((int16_t)d->W(6));
r.B(7) = satub((int16_t)d->W(7));
#endif
r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));
r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1));
r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2));
r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3));
#if SHIFT == 1
r.B(12) = satub((int16_t)s->W(4));
r.B(13) = satub((int16_t)s->W(5));
r.B(14) = satub((int16_t)s->W(6));
r.B(15) = satub((int16_t)s->W(7));
#endif
MOVE(*d, r);
}
void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{ {
Reg r; Reg *v = d;
uint16_t r[PACK_WIDTH];
int j, k;
r.W(0) = satsw(d->L(0)); for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
r.W(1) = satsw(d->L(1)); for (k = 0; k < PACK_WIDTH / 2; k++) {
#if SHIFT == 1 r[k] = satsw(v->L(j + k));
r.W(2) = satsw(d->L(2)); }
r.W(3) = satsw(d->L(3)); for (k = 0; k < PACK_WIDTH / 2; k++) {
#endif r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
r.W((2 << SHIFT) + 0) = satsw(s->L(0)); }
r.W((2 << SHIFT) + 1) = satsw(s->L(1)); for (k = 0; k < PACK_WIDTH; k++) {
#if SHIFT == 1 d->W(2 * j + k) = r[k];
r.W(6) = satsw(s->L(2)); }
r.W(7) = satsw(s->L(3)); }
#endif
MOVE(*d, r);
} }
#define UNPCK_OP(base_name, base) \ #define UNPCK_OP(base_name, base) \
\ \
void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
Reg *d, Reg *s) \ Reg *d, Reg *s) \
{ \ { \
Reg r; \ Reg *v = d; \
uint8_t r[PACK_WIDTH * 2]; \
int j, i; \
\ \
r.B(0) = d->B((base << (SHIFT + 2)) + 0); \ for (j = 0; j < 8 << SHIFT; ) { \
r.B(1) = s->B((base << (SHIFT + 2)) + 0); \ int k = j + base * PACK_WIDTH; \
r.B(2) = d->B((base << (SHIFT + 2)) + 1); \ for (i = 0; i < PACK_WIDTH; i++) { \
r.B(3) = s->B((base << (SHIFT + 2)) + 1); \ r[2 * i] = v->B(k + i); \
r.B(4) = d->B((base << (SHIFT + 2)) + 2); \ r[2 * i + 1] = s->B(k + i); \
r.B(5) = s->B((base << (SHIFT + 2)) + 2); \ } \
r.B(6) = d->B((base << (SHIFT + 2)) + 3); \ for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
r.B(7) = s->B((base << (SHIFT + 2)) + 3); \ d->B(j) = r[i]; \
XMM_ONLY( \ } \
r.B(8) = d->B((base << (SHIFT + 2)) + 4); \ } \
r.B(9) = s->B((base << (SHIFT + 2)) + 4); \
r.B(10) = d->B((base << (SHIFT + 2)) + 5); \
r.B(11) = s->B((base << (SHIFT + 2)) + 5); \
r.B(12) = d->B((base << (SHIFT + 2)) + 6); \
r.B(13) = s->B((base << (SHIFT + 2)) + 6); \
r.B(14) = d->B((base << (SHIFT + 2)) + 7); \
r.B(15) = s->B((base << (SHIFT + 2)) + 7); \
) \
MOVE(*d, r); \
} \ } \
\ \
void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
Reg *d, Reg *s) \ Reg *d, Reg *s) \
{ \ { \
Reg r; \ Reg *v = d; \
uint16_t r[PACK_WIDTH]; \
int j, i; \
\ \
r.W(0) = d->W((base << (SHIFT + 1)) + 0); \ for (j = 0; j < 4 << SHIFT; ) { \
r.W(1) = s->W((base << (SHIFT + 1)) + 0); \ int k = j + base * PACK_WIDTH / 2; \
r.W(2) = d->W((base << (SHIFT + 1)) + 1); \ for (i = 0; i < PACK_WIDTH / 2; i++) { \
r.W(3) = s->W((base << (SHIFT + 1)) + 1); \ r[2 * i] = v->W(k + i); \
XMM_ONLY( \ r[2 * i + 1] = s->W(k + i); \
r.W(4) = d->W((base << (SHIFT + 1)) + 2); \ } \
r.W(5) = s->W((base << (SHIFT + 1)) + 2); \ for (i = 0; i < PACK_WIDTH; i++, j++) { \
r.W(6) = d->W((base << (SHIFT + 1)) + 3); \ d->W(j) = r[i]; \
r.W(7) = s->W((base << (SHIFT + 1)) + 3); \ } \
) \ } \
MOVE(*d, r); \
} \ } \
\ \
void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
Reg *d, Reg *s) \ Reg *d, Reg *s) \
{ \ { \
Reg r; \ Reg *v = d; \
uint32_t r[PACK_WIDTH / 2]; \
int j, i; \
\ \
r.L(0) = d->L((base << SHIFT) + 0); \ for (j = 0; j < 2 << SHIFT; ) { \
r.L(1) = s->L((base << SHIFT) + 0); \ int k = j + base * PACK_WIDTH / 4; \
XMM_ONLY( \ for (i = 0; i < PACK_WIDTH / 4; i++) { \
r.L(2) = d->L((base << SHIFT) + 1); \ r[2 * i] = v->L(k + i); \
r.L(3) = s->L((base << SHIFT) + 1); \ r[2 * i + 1] = s->L(k + i); \
) \ } \
MOVE(*d, r); \ for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
d->L(j) = r[i]; \
} \
} \
} \ } \
\ \
XMM_ONLY( \ XMM_ONLY( \
void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \ void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
*env, \ CPUX86State *env, Reg *d, Reg *s) \
Reg *d, \
Reg *s) \
{ \ { \
Reg r; \ Reg *v = d; \
uint64_t r[2]; \
int i; \
\ \
r.Q(0) = d->Q(base); \ for (i = 0; i < 1 << SHIFT; i += 2) { \
r.Q(1) = s->Q(base); \ r[0] = v->Q(base + i); \
MOVE(*d, r); \ r[1] = s->Q(base + i); \
d->Q(i) = r[0]; \
d->Q(i + 1) = r[1]; \
} \
} \ } \
) )
UNPCK_OP(l, 0) UNPCK_OP(l, 0)
UNPCK_OP(h, 1) UNPCK_OP(h, 1)
#undef PACK_WIDTH
#undef PACK_HELPER_B
#undef UNPCK_OP
/* 3DNow! float ops */ /* 3DNow! float ops */
#if SHIFT == 0 #if SHIFT == 0
void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
@ -1393,122 +1380,86 @@ void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
/* SSSE3 op helpers */ /* SSSE3 op helpers */
void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{ {
Reg *v = d;
int i; int i;
Reg r; #if SHIFT == 0
uint8_t r[8];
for (i = 0; i < (8 << SHIFT); i++) { for (i = 0; i < 8; i++) {
r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
} }
for (i = 0; i < 8; i++) {
d->B(i) = r[i];
}
#else
uint8_t r[8 << SHIFT];
MOVE(*d, r); for (i = 0; i < 8 << SHIFT; i++) {
} int j = i & ~0xf;
r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) }
{ for (i = 0; i < 8 << SHIFT; i++) {
d->B(i) = r[i];
Reg r; }
r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1);
r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3);
XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5));
XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7));
r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1);
r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3);
XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5));
XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7));
MOVE(*d, r);
}
void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
Reg r;
r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1);
XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3));
r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1);
XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3));
MOVE(*d, r);
}
void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
Reg r;
r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1));
r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3));
XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5)));
XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7)));
r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1));
r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3));
XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5)));
XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7)));
MOVE(*d, r);
}
void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) +
(int8_t)s->B(1) * (uint8_t)d->B(1));
d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) +
(int8_t)s->B(3) * (uint8_t)d->B(3));
d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) +
(int8_t)s->B(5) * (uint8_t)d->B(5));
d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) +
(int8_t)s->B(7) * (uint8_t)d->B(7));
#if SHIFT == 1
d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) +
(int8_t)s->B(9) * (uint8_t)d->B(9));
d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) +
(int8_t)s->B(11) * (uint8_t)d->B(11));
d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) +
(int8_t)s->B(13) * (uint8_t)d->B(13));
d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) +
(int8_t)s->B(15) * (uint8_t)d->B(15));
#endif #endif
} }
void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #define SSE_HELPER_HW(name, F) \
{ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
Reg r; { \
Reg *v = d; \
r.W(0) = (int16_t)d->W(0) - (int16_t)d->W(1); uint16_t r[4 << SHIFT]; \
r.W(1) = (int16_t)d->W(2) - (int16_t)d->W(3); int i, j, k; \
XMM_ONLY(r.W(2) = (int16_t)d->W(4) - (int16_t)d->W(5)); for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
XMM_ONLY(r.W(3) = (int16_t)d->W(6) - (int16_t)d->W(7)); for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1); r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3); } \
XMM_ONLY(r.W(6) = (int16_t)s->W(4) - (int16_t)s->W(5)); for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
XMM_ONLY(r.W(7) = (int16_t)s->W(6) - (int16_t)s->W(7)); r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
MOVE(*d, r); } \
} \
for (i = 0; i < 4 << SHIFT; i++) { \
d->W(i) = r[i]; \
} \
} }
void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) #define SSE_HELPER_HL(name, F) \
{ void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
Reg r; { \
Reg *v = d; \
r.L(0) = (int32_t)d->L(0) - (int32_t)d->L(1); uint32_t r[2 << SHIFT]; \
XMM_ONLY(r.L(1) = (int32_t)d->L(2) - (int32_t)d->L(3)); int i, j, k; \
r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1); for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
XMM_ONLY(r.L(3) = (int32_t)s->L(2) - (int32_t)s->L(3)); for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
MOVE(*d, r); r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
} \
for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
} \
} \
for (i = 0; i < 2 << SHIFT; i++) { \
d->L(i) = r[i]; \
} \
} }
void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) SSE_HELPER_HW(phaddw, FADD)
{ SSE_HELPER_HW(phsubw, FSUB)
Reg r; SSE_HELPER_HW(phaddsw, FADDSW)
SSE_HELPER_HW(phsubsw, FSUBSW)
SSE_HELPER_HL(phaddd, FADD)
SSE_HELPER_HL(phsubd, FSUB)
r.W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1)); #undef SSE_HELPER_HW
r.W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3)); #undef SSE_HELPER_HL
XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5)));
XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7))); void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1)); {
r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3)); Reg *v = d;
XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5))); int i;
XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7))); for (i = 0; i < 4 << SHIFT; i++) {
MOVE(*d, r); d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
(int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
}
} }
#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x) #define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x)
@ -1531,32 +1482,38 @@ SSE_HELPER_L(helper_psignd, FSIGNL)
void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
int32_t shift) int32_t shift)
{ {
Reg r; Reg *v = d;
int i;
/* XXX could be checked during translation */ /* XXX could be checked during translation */
if (shift >= (16 << SHIFT)) { if (shift >= (SHIFT ? 32 : 16)) {
r.Q(0) = 0; for (i = 0; i < (1 << SHIFT); i++) {
XMM_ONLY(r.Q(1) = 0); d->Q(i) = 0;
}
} else { } else {
shift <<= 3; shift <<= 3;
#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
#if SHIFT == 0 #if SHIFT == 0
r.Q(0) = SHR(s->Q(0), shift - 0) | d->Q(0) = SHR(s->Q(0), shift - 0) |
SHR(d->Q(0), shift - 64); SHR(v->Q(0), shift - 64);
#else #else
r.Q(0) = SHR(s->Q(0), shift - 0) | for (i = 0; i < (1 << SHIFT); i += 2) {
SHR(s->Q(1), shift - 64) | uint64_t r0, r1;
SHR(d->Q(0), shift - 128) |
SHR(d->Q(1), shift - 192); r0 = SHR(s->Q(i), shift - 0) |
r.Q(1) = SHR(s->Q(0), shift + 64) | SHR(s->Q(i + 1), shift - 64) |
SHR(s->Q(1), shift - 0) | SHR(v->Q(i), shift - 128) |
SHR(d->Q(0), shift - 64) | SHR(v->Q(i + 1), shift - 192);
SHR(d->Q(1), shift - 128); r1 = SHR(s->Q(i), shift + 64) |
SHR(s->Q(i + 1), shift - 0) |
SHR(v->Q(i), shift - 64) |
SHR(v->Q(i + 1), shift - 128);
d->Q(i) = r0;
d->Q(i + 1) = r1;
}
#endif #endif
#undef SHR #undef SHR
} }
MOVE(*d, r);
} }
#define XMM0 (env->xmm_regs[0]) #define XMM0 (env->xmm_regs[0])
@ -1681,17 +1638,23 @@ SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{ {
Reg r; Reg *v = d;
uint16_t r[8];
int i, j, k;
r.W(0) = satuw((int32_t) d->L(0)); for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
r.W(1) = satuw((int32_t) d->L(1)); r[0] = satuw(v->L(j));
r.W(2) = satuw((int32_t) d->L(2)); r[1] = satuw(v->L(j + 1));
r.W(3) = satuw((int32_t) d->L(3)); r[2] = satuw(v->L(j + 2));
r.W(4) = satuw((int32_t) s->L(0)); r[3] = satuw(v->L(j + 3));
r.W(5) = satuw((int32_t) s->L(1)); r[4] = satuw(s->L(j));
r.W(6) = satuw((int32_t) s->L(2)); r[5] = satuw(s->L(j + 1));
r.W(7) = satuw((int32_t) s->L(3)); r[6] = satuw(s->L(j + 2));
MOVE(*d, r); r[7] = satuw(s->L(j + 3));
for (k = 0; k < 8; k++) {
d->W(i + k) = r[k];
}
}
} }
#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
@ -1947,20 +1910,25 @@ void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
uint32_t offset) uint32_t offset)
{ {
int s0 = (offset & 3) << 2; Reg *v = d;
int d0 = (offset & 4) << 0; int i, j;
int i; uint16_t r[8];
Reg r;
for (i = 0; i < 8; i++, d0++) { for (j = 0; j < 4 << SHIFT; ) {
r.W(i) = 0; int s0 = (j * 2) + ((offset & 3) << 2);
r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); int d0 = (j * 2) + ((offset & 4) << 0);
r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); r[i] = 0;
r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
}
for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
d->W(j) = r[i];
}
offset >>= 3;
} }
MOVE(*d, r);
} }
/* SSE4.2 op helpers */ /* SSE4.2 op helpers */