mirror of https://github.com/PCSX2/pcsx2.git
USB: restore formatting of standalone third party code
This commit is contained in:
parent
9da3d9a5bf
commit
ede7fa86fa
|
@ -35,163 +35,38 @@
|
|||
#include "jo_mpeg.h"
|
||||
|
||||
// Huffman tables
|
||||
static const unsigned char s_jo_HTDC_Y[9][2] = {{4, 3}, {0, 2}, {1, 2}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}};
|
||||
static const unsigned char s_jo_HTDC_C[9][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}};
|
||||
static const unsigned char s_jo_HTDC_Y[9][2] = {{4,3}, {0,2}, {1,2}, {5,3}, {6,3}, {14,4}, {30,5}, {62,6}, {126,7}};
|
||||
static const unsigned char s_jo_HTDC_C[9][2] = {{0,2}, {1,2}, {2,2}, {6,3}, {14,4}, {30,5}, {62,6}, {126,7}, {254,8}};
|
||||
static const unsigned char s_jo_HTAC[32][40][2] = {
|
||||
{
|
||||
{6, 3},
|
||||
{8, 5},
|
||||
{10, 6},
|
||||
{12, 8},
|
||||
{76, 9},
|
||||
{66, 9},
|
||||
{20, 11},
|
||||
{58, 13},
|
||||
{48, 13},
|
||||
{38, 13},
|
||||
{32, 13},
|
||||
{52, 14},
|
||||
{50, 14},
|
||||
{48, 14},
|
||||
{46, 14},
|
||||
{62, 15},
|
||||
{62, 15},
|
||||
{58, 15},
|
||||
{56, 15},
|
||||
{54, 15},
|
||||
{52, 15},
|
||||
{50, 15},
|
||||
{48, 15},
|
||||
{46, 15},
|
||||
{44, 15},
|
||||
{42, 15},
|
||||
{40, 15},
|
||||
{38, 15},
|
||||
{36, 15},
|
||||
{34, 15},
|
||||
{32, 15},
|
||||
{48, 16},
|
||||
{46, 16},
|
||||
{44, 16},
|
||||
{42, 16},
|
||||
{40, 16},
|
||||
{38, 16},
|
||||
{36, 16},
|
||||
{34, 16},
|
||||
{32, 16},
|
||||
},
|
||||
{{6, 4}, {12, 7}, {74, 9}, {24, 11}, {54, 13}, {44, 14}, {42, 14}, {62, 16}, {60, 16}, {58, 16}, {56, 16}, {54, 16}, {52, 16}, {50, 16}, {38, 17}, {36, 17}, {34, 17}, {32, 17}},
|
||||
{{10, 5}, {8, 8}, {22, 11}, {40, 13}, {40, 14}},
|
||||
{{14, 6}, {72, 9}, {56, 13}, {38, 14}},
|
||||
{{12, 6}, {30, 11}, {36, 13}},
|
||||
{{14, 7}, {18, 11}, {36, 14}},
|
||||
{{10, 7}, {60, 13}, {40, 17}},
|
||||
{{8, 7}, {42, 13}},
|
||||
{{14, 8}, {34, 13}},
|
||||
{{10, 8}, {34, 14}},
|
||||
{{78, 9}, {32, 14}},
|
||||
{{70, 9}, {52, 17}},
|
||||
{{68, 9}, {50, 17}},
|
||||
{{64, 9}, {48, 17}},
|
||||
{{28, 11}, {46, 17}},
|
||||
{{26, 11}, {44, 17}},
|
||||
{{16, 11}, {42, 17}},
|
||||
{{62, 13}},
|
||||
{{52, 13}},
|
||||
{{50, 13}},
|
||||
{{46, 13}},
|
||||
{{44, 13}},
|
||||
{{62, 14}},
|
||||
{{60, 14}},
|
||||
{{58, 14}},
|
||||
{{56, 14}},
|
||||
{{54, 14}},
|
||||
{{62, 17}},
|
||||
{{60, 17}},
|
||||
{{58, 17}},
|
||||
{{56, 17}},
|
||||
{{54, 17}},
|
||||
{{6,3},{8,5},{10,6},{12,8},{76,9},{66,9},{20,11},{58,13},{48,13},{38,13},{32,13},{52,14},{50,14},{48,14},{46,14},{62,15},{62,15},{58,15},{56,15},{54,15},{52,15},{50,15},{48,15},{46,15},{44,15},{42,15},{40,15},{38,15},{36,15},{34,15},{32,15},{48,16},{46,16},{44,16},{42,16},{40,16},{38,16},{36,16},{34,16},{32,16},},
|
||||
{{6,4},{12,7},{74,9},{24,11},{54,13},{44,14},{42,14},{62,16},{60,16},{58,16},{56,16},{54,16},{52,16},{50,16},{38,17},{36,17},{34,17},{32,17}},
|
||||
{{10,5},{8,8},{22,11},{40,13},{40,14}},
|
||||
{{14,6},{72,9},{56,13},{38,14}},
|
||||
{{12,6},{30,11},{36,13}}, {{14,7},{18,11},{36,14}}, {{10,7},{60,13},{40,17}},
|
||||
{{8,7},{42,13}}, {{14,8},{34,13}}, {{10,8},{34,14}}, {{78,9},{32,14}}, {{70,9},{52,17}}, {{68,9},{50,17}}, {{64,9},{48,17}}, {{28,11},{46,17}}, {{26,11},{44,17}}, {{16,11},{42,17}},
|
||||
{{62,13}}, {{52,13}}, {{50,13}}, {{46,13}}, {{44,13}}, {{62,14}}, {{60,14}}, {{58,14}}, {{56,14}}, {{54,14}}, {{62,17}}, {{60,17}}, {{58,17}}, {{56,17}}, {{54,17}},
|
||||
};
|
||||
static const float s_jo_quantTbl[64] = {
|
||||
0.015625f,
|
||||
0.005632f,
|
||||
0.005035f,
|
||||
0.004832f,
|
||||
0.004808f,
|
||||
0.005892f,
|
||||
0.007964f,
|
||||
0.013325f,
|
||||
0.005632f,
|
||||
0.004061f,
|
||||
0.003135f,
|
||||
0.003193f,
|
||||
0.003338f,
|
||||
0.003955f,
|
||||
0.004898f,
|
||||
0.008828f,
|
||||
0.005035f,
|
||||
0.003135f,
|
||||
0.002816f,
|
||||
0.003013f,
|
||||
0.003299f,
|
||||
0.003581f,
|
||||
0.005199f,
|
||||
0.009125f,
|
||||
0.004832f,
|
||||
0.003484f,
|
||||
0.003129f,
|
||||
0.003348f,
|
||||
0.003666f,
|
||||
0.003979f,
|
||||
0.005309f,
|
||||
0.009632f,
|
||||
0.005682f,
|
||||
0.003466f,
|
||||
0.003543f,
|
||||
0.003666f,
|
||||
0.003906f,
|
||||
0.004546f,
|
||||
0.005774f,
|
||||
0.009439f,
|
||||
0.006119f,
|
||||
0.004248f,
|
||||
0.004199f,
|
||||
0.004228f,
|
||||
0.004546f,
|
||||
0.005062f,
|
||||
0.006124f,
|
||||
0.009942f,
|
||||
0.008883f,
|
||||
0.006167f,
|
||||
0.006096f,
|
||||
0.005777f,
|
||||
0.006078f,
|
||||
0.006391f,
|
||||
0.007621f,
|
||||
0.012133f,
|
||||
0.016780f,
|
||||
0.011263f,
|
||||
0.009907f,
|
||||
0.010139f,
|
||||
0.009849f,
|
||||
0.010297f,
|
||||
0.012133f,
|
||||
0.019785f,
|
||||
0.015625f,0.005632f,0.005035f,0.004832f,0.004808f,0.005892f,0.007964f,0.013325f,
|
||||
0.005632f,0.004061f,0.003135f,0.003193f,0.003338f,0.003955f,0.004898f,0.008828f,
|
||||
0.005035f,0.003135f,0.002816f,0.003013f,0.003299f,0.003581f,0.005199f,0.009125f,
|
||||
0.004832f,0.003484f,0.003129f,0.003348f,0.003666f,0.003979f,0.005309f,0.009632f,
|
||||
0.005682f,0.003466f,0.003543f,0.003666f,0.003906f,0.004546f,0.005774f,0.009439f,
|
||||
0.006119f,0.004248f,0.004199f,0.004228f,0.004546f,0.005062f,0.006124f,0.009942f,
|
||||
0.008883f,0.006167f,0.006096f,0.005777f,0.006078f,0.006391f,0.007621f,0.012133f,
|
||||
0.016780f,0.011263f,0.009907f,0.010139f,0.009849f,0.010297f,0.012133f,0.019785f,
|
||||
};
|
||||
static const unsigned char s_jo_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
|
||||
static const unsigned char s_jo_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char* buf_ptr;
|
||||
typedef struct {
|
||||
unsigned char *buf_ptr;
|
||||
int buf, cnt;
|
||||
} jo_bits_t;
|
||||
|
||||
static void jo_writeBits(jo_bits_t* b, int value, int count)
|
||||
{
|
||||
static void jo_writeBits(jo_bits_t *b, int value, int count) {
|
||||
b->cnt += count;
|
||||
b->buf |= value << (24 - b->cnt);
|
||||
while (b->cnt >= 8)
|
||||
{
|
||||
while(b->cnt >= 8) {
|
||||
unsigned char c = (b->buf >> 16) & 255;
|
||||
//putc(c, b->fp);
|
||||
*(b->buf_ptr) = c & 0xff;
|
||||
|
@ -201,8 +76,7 @@ static void jo_writeBits(jo_bits_t* b, int value, int count)
|
|||
}
|
||||
}
|
||||
|
||||
static void jo_DCT(float* d0, float* d1, float* d2, float* d3, float* d4, float* d5, float* d6, float* d7)
|
||||
{
|
||||
static void jo_DCT(float *d0, float *d1, float *d2, float *d3, float *d4, float *d5, float *d6, float *d7) {
|
||||
float tmp0 = *d0 + *d7;
|
||||
float tmp7 = *d0 - *d7;
|
||||
float tmp1 = *d1 + *d6;
|
||||
|
@ -213,52 +87,48 @@ static void jo_DCT(float* d0, float* d1, float* d2, float* d3, float* d4, float*
|
|||
float tmp4 = *d3 - *d4;
|
||||
|
||||
// Even part
|
||||
float tmp10 = tmp0 + tmp3; // phase 2
|
||||
float tmp10 = tmp0 + tmp3; // phase 2
|
||||
float tmp13 = tmp0 - tmp3;
|
||||
float tmp11 = tmp1 + tmp2;
|
||||
float tmp12 = tmp1 - tmp2;
|
||||
|
||||
*d0 = tmp10 + tmp11; // phase 3
|
||||
*d0 = tmp10 + tmp11; // phase 3
|
||||
*d4 = tmp10 - tmp11;
|
||||
|
||||
float z1 = (tmp12 + tmp13) * 0.707106781f; // c4
|
||||
*d2 = tmp13 + z1; // phase 5
|
||||
*d2 = tmp13 + z1; // phase 5
|
||||
*d6 = tmp13 - z1;
|
||||
|
||||
// Odd part
|
||||
tmp10 = tmp4 + tmp5; // phase 2
|
||||
tmp10 = tmp4 + tmp5; // phase 2
|
||||
tmp11 = tmp5 + tmp6;
|
||||
tmp12 = tmp6 + tmp7;
|
||||
|
||||
// The rotator is modified from fig 4-8 to avoid extra negations.
|
||||
float z5 = (tmp10 - tmp12) * 0.382683433f; // c6
|
||||
float z2 = tmp10 * 0.541196100f + z5; // c2-c6
|
||||
float z4 = tmp12 * 1.306562965f + z5; // c2+c6
|
||||
float z3 = tmp11 * 0.707106781f; // c4
|
||||
float z2 = tmp10 * 0.541196100f + z5; // c2-c6
|
||||
float z4 = tmp12 * 1.306562965f + z5; // c2+c6
|
||||
float z3 = tmp11 * 0.707106781f; // c4
|
||||
|
||||
float z11 = tmp7 + z3; // phase 5
|
||||
float z11 = tmp7 + z3; // phase 5
|
||||
float z13 = tmp7 - z3;
|
||||
|
||||
*d5 = z13 + z2; // phase 6
|
||||
*d5 = z13 + z2; // phase 6
|
||||
*d3 = z13 - z2;
|
||||
*d1 = z11 + z4;
|
||||
*d7 = z11 - z4;
|
||||
}
|
||||
|
||||
static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9][2], int DC)
|
||||
{
|
||||
for (int dataOff = 0; dataOff < 64; dataOff += 8)
|
||||
{
|
||||
jo_DCT(&A[dataOff], &A[dataOff + 1], &A[dataOff + 2], &A[dataOff + 3], &A[dataOff + 4], &A[dataOff + 5], &A[dataOff + 6], &A[dataOff + 7]);
|
||||
static int jo_processDU(jo_bits_t *bits, float A[64], const unsigned char htdc[9][2], int DC) {
|
||||
for(int dataOff=0; dataOff<64; dataOff+=8) {
|
||||
jo_DCT(&A[dataOff], &A[dataOff+1], &A[dataOff+2], &A[dataOff+3], &A[dataOff+4], &A[dataOff+5], &A[dataOff+6], &A[dataOff+7]);
|
||||
}
|
||||
for (int dataOff = 0; dataOff < 8; ++dataOff)
|
||||
{
|
||||
jo_DCT(&A[dataOff], &A[dataOff + 8], &A[dataOff + 16], &A[dataOff + 24], &A[dataOff + 32], &A[dataOff + 40], &A[dataOff + 48], &A[dataOff + 56]);
|
||||
for(int dataOff=0; dataOff<8; ++dataOff) {
|
||||
jo_DCT(&A[dataOff], &A[dataOff+8], &A[dataOff+16], &A[dataOff+24], &A[dataOff+32], &A[dataOff+40], &A[dataOff+48], &A[dataOff+56]);
|
||||
}
|
||||
int Q[64];
|
||||
for (int i = 0; i < 64; ++i)
|
||||
{
|
||||
float v = A[i] * s_jo_quantTbl[i];
|
||||
for(int i=0; i<64; ++i) {
|
||||
float v = A[i]*s_jo_quantTbl[i];
|
||||
Q[s_jo_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
|
||||
}
|
||||
|
||||
|
@ -266,48 +136,36 @@ static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9
|
|||
int aDC = DC < 0 ? -DC : DC;
|
||||
int size = 0;
|
||||
int tempval = aDC;
|
||||
while (tempval)
|
||||
{
|
||||
while(tempval) {
|
||||
size++;
|
||||
tempval >>= 1;
|
||||
}
|
||||
jo_writeBits(bits, htdc[size][0], htdc[size][1]);
|
||||
if (DC < 0)
|
||||
aDC ^= (1 << size) - 1;
|
||||
if(DC < 0) aDC ^= (1 << size) - 1;
|
||||
jo_writeBits(bits, aDC, size);
|
||||
|
||||
int endpos = 63;
|
||||
for (; (endpos > 0) && (Q[endpos] == 0); --endpos)
|
||||
{ /* do nothing */
|
||||
}
|
||||
for (int i = 1; i <= endpos;)
|
||||
{
|
||||
for(; (endpos>0)&&(Q[endpos]==0); --endpos) { /* do nothing */ }
|
||||
for(int i = 1; i <= endpos;) {
|
||||
int run = 0;
|
||||
while (Q[i] == 0 && i < endpos)
|
||||
{
|
||||
while (Q[i]==0 && i<endpos) {
|
||||
++run;
|
||||
++i;
|
||||
}
|
||||
int AC = Q[i++];
|
||||
int aAC = AC < 0 ? -AC : AC;
|
||||
int code = 0, size = 0;
|
||||
if (run < 32 && aAC <= 40)
|
||||
{
|
||||
code = s_jo_HTAC[run][aAC - 1][0];
|
||||
size = s_jo_HTAC[run][aAC - 1][1];
|
||||
if (AC < 0)
|
||||
code += 1;
|
||||
if (run<32 && aAC<=40) {
|
||||
code = s_jo_HTAC[run][aAC-1][0];
|
||||
size = s_jo_HTAC[run][aAC-1][1];
|
||||
if (AC < 0) code += 1;
|
||||
}
|
||||
if (!size)
|
||||
{
|
||||
if(!size) {
|
||||
jo_writeBits(bits, 1, 6);
|
||||
jo_writeBits(bits, run, 6);
|
||||
if (AC < -127)
|
||||
{
|
||||
if (AC < -127) {
|
||||
jo_writeBits(bits, 128, 12);
|
||||
}
|
||||
else if (AC > 127)
|
||||
{
|
||||
} else if(AC > 127) {
|
||||
jo_writeBits(bits, 0, 12);
|
||||
}
|
||||
code = AC & 0xFFF;
|
||||
|
@ -320,23 +178,17 @@ static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9
|
|||
return Q[0];
|
||||
}
|
||||
|
||||
unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* raw, int width, int height, int format, int flipx, int flipy)
|
||||
{
|
||||
unsigned long jo_write_mpeg(unsigned char *mpeg_buf, const unsigned char *raw, int width, int height, int format, int flipx, int flipy) {
|
||||
int lastDCY = 128, lastDCCR = 128, lastDCCB = 128;
|
||||
unsigned char* head = mpeg_buf;
|
||||
unsigned char *head = mpeg_buf;
|
||||
jo_bits_t bits = {mpeg_buf};
|
||||
|
||||
for (int vblock = 0; vblock < (height + 15) / 16; vblock++)
|
||||
{
|
||||
for (int hblock = 0; hblock < (width + 15) / 16; hblock++)
|
||||
{
|
||||
if (vblock == 0 && hblock == 0)
|
||||
{
|
||||
for (int vblock = 0; vblock < (height+15)/16; vblock++) {
|
||||
for (int hblock = 0; hblock < (width+15)/16; hblock++) {
|
||||
if (vblock == 0 && hblock == 0) {
|
||||
jo_writeBits(&bits, 0b01, 2); // macroblock_type = intra+quant
|
||||
jo_writeBits(&bits, 8, 5); // quantiser_scale_code = 8
|
||||
}
|
||||
else
|
||||
{
|
||||
jo_writeBits(&bits, 8, 5); // quantiser_scale_code = 8
|
||||
} else {
|
||||
jo_writeBits(&bits, 0b1, 1); // macroblock_address_increment
|
||||
jo_writeBits(&bits, 0b1, 1); // macroblock_type = intra
|
||||
}
|
||||
|
@ -344,113 +196,87 @@ unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* raw, i
|
|||
float Y[256], CBx[256], CRx[256];
|
||||
float CB[64], CR[64];
|
||||
|
||||
if (format == JO_RGBX)
|
||||
{
|
||||
for (int i = 0; i < 256; ++i)
|
||||
{
|
||||
int y = vblock * 16 + (i / 16);
|
||||
int x = hblock * 16 + (i & 15);
|
||||
x = x >= width ? width - 1 : x;
|
||||
y = y >= height ? height - 1 : y;
|
||||
if (flipx)
|
||||
x = width - 1 - x;
|
||||
if (flipy)
|
||||
y = height - 1 - y;
|
||||
const unsigned char* c = raw + y * width * 4 + x * 4;
|
||||
if (format == JO_RGBX) {
|
||||
for (int i=0; i<256; ++i) {
|
||||
int y = vblock*16+(i/16);
|
||||
int x = hblock*16+(i&15);
|
||||
x = x >= width ? width-1 : x;
|
||||
y = y >= height ? height-1 : y;
|
||||
if (flipx) x = width - 1 - x;
|
||||
if (flipy) y = height - 1 - y;
|
||||
const unsigned char *c = raw + y*width*4+x*4;
|
||||
float r, g, b;
|
||||
if (flipx && flipy)
|
||||
{
|
||||
if (flipx && flipy) {
|
||||
r = c[2], g = c[1], b = c[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
r = c[0], g = c[1], b = c[2];
|
||||
}
|
||||
Y[i] = (0.299f * r + 0.587f * g + 0.114f * b) * (219.f / 255) + 16;
|
||||
CBx[i] = (-0.299f * r - 0.587f * g + 0.886f * b) * (224.f / 255) + 128;
|
||||
CRx[i] = (0.701f * r - 0.587f * g - 0.114f * b) * (224.f / 255) + 128;
|
||||
Y[i] = (0.299f*r + 0.587f*g + 0.114f*b) * (219.f/255) + 16;
|
||||
CBx[i] = (-0.299f*r - 0.587f*g + 0.886f*b) * (224.f/255) + 128;
|
||||
CRx[i] = (0.701f*r - 0.587f*g - 0.114f*b) * (224.f/255) + 128;
|
||||
}
|
||||
// Downsample Cb,Cr (420 format)
|
||||
for (int i = 0; i < 64; ++i)
|
||||
{
|
||||
int j = (i & 7) * 2 + (i & 56) * 4;
|
||||
CB[i] = (CBx[j] + CBx[j + 1] + CBx[j + 16] + CBx[j + 17]) * 0.25f;
|
||||
CR[i] = (CRx[j] + CRx[j + 1] + CRx[j + 16] + CRx[j + 17]) * 0.25f;
|
||||
for (int i=0; i<64; ++i) {
|
||||
int j =(i&7)*2 + (i&56)*4;
|
||||
CB[i] = (CBx[j] + CBx[j+1] + CBx[j+16] + CBx[j+17]) * 0.25f;
|
||||
CR[i] = (CRx[j] + CRx[j+1] + CRx[j+16] + CRx[j+17]) * 0.25f;
|
||||
}
|
||||
}
|
||||
else if (format == JO_RGB24)
|
||||
{
|
||||
for (int i = 0; i < 256; ++i)
|
||||
{
|
||||
int y = vblock * 16 + (i / 16);
|
||||
int x = hblock * 16 + (i & 15);
|
||||
x = x >= width ? width - 1 : x;
|
||||
y = y >= height ? height - 1 : y;
|
||||
if (flipx)
|
||||
x = width - 1 - x;
|
||||
if (flipy)
|
||||
y = height - 1 - y;
|
||||
const unsigned char* c = raw + y * width * 3 + x * 3;
|
||||
} else
|
||||
if (format == JO_RGB24) {
|
||||
for (int i=0; i<256; ++i) {
|
||||
int y = vblock*16+(i/16);
|
||||
int x = hblock*16+(i&15);
|
||||
x = x >= width ? width-1 : x;
|
||||
y = y >= height ? height-1 : y;
|
||||
if (flipx) x = width - 1 - x;
|
||||
if (flipy) y = height - 1 - y;
|
||||
const unsigned char *c = raw + y*width*3+x*3;
|
||||
float r, g, b;
|
||||
if (flipx && flipy)
|
||||
{
|
||||
if (flipx && flipy) {
|
||||
r = c[2], g = c[1], b = c[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
r = c[0], g = c[1], b = c[2];
|
||||
}
|
||||
Y[i] = (0.299f * r + 0.587f * g + 0.114f * b) * (219.f / 255) + 16;
|
||||
CBx[i] = (-0.299f * r - 0.587f * g + 0.886f * b) * (224.f / 255) + 128;
|
||||
CRx[i] = (0.701f * r - 0.587f * g - 0.114f * b) * (224.f / 255) + 128;
|
||||
Y[i] = (0.299f*r + 0.587f*g + 0.114f*b) * (219.f/255) + 16;
|
||||
CBx[i] = (-0.299f*r - 0.587f*g + 0.886f*b) * (224.f/255) + 128;
|
||||
CRx[i] = (0.701f*r - 0.587f*g - 0.114f*b) * (224.f/255) + 128;
|
||||
}
|
||||
// Downsample Cb,Cr (420 format)
|
||||
for (int i = 0; i < 64; ++i)
|
||||
{
|
||||
int j = (i & 7) * 2 + (i & 56) * 4;
|
||||
CB[i] = (CBx[j] + CBx[j + 1] + CBx[j + 16] + CBx[j + 17]) * 0.25f;
|
||||
CR[i] = (CRx[j] + CRx[j + 1] + CRx[j + 16] + CRx[j + 17]) * 0.25f;
|
||||
for (int i=0; i<64; ++i) {
|
||||
int j =(i&7)*2 + (i&56)*4;
|
||||
CB[i] = (CBx[j] + CBx[j+1] + CBx[j+16] + CBx[j+17]) * 0.25f;
|
||||
CR[i] = (CRx[j] + CRx[j+1] + CRx[j+16] + CRx[j+17]) * 0.25f;
|
||||
}
|
||||
}
|
||||
else if (format == JO_YUYV)
|
||||
{
|
||||
for (int i = 0; i < 256; i += 2)
|
||||
{
|
||||
int y = vblock * 16 + (i / 16);
|
||||
int x = hblock * 16 + (i & 15);
|
||||
x = x >= width ? width - 1 : x;
|
||||
y = y >= height ? height - 1 : y;
|
||||
if (flipx)
|
||||
x = width - 1 - x;
|
||||
if (flipy)
|
||||
y = height - 1 - y;
|
||||
const unsigned char* c = raw + y * width * 2 + x * 2 - 2;
|
||||
if (flipx)
|
||||
{
|
||||
Y[i + 1] = c[0];
|
||||
CB[i / 4] = c[1];
|
||||
Y[i] = c[2];
|
||||
CR[i / 4] = c[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
Y[i] = c[2];
|
||||
CB[i / 4] = c[3];
|
||||
Y[i + 1] = c[4];
|
||||
CR[i / 4] = c[5];
|
||||
} else
|
||||
if (format == JO_YUYV) {
|
||||
for (int i=0; i<256; i+=2) {
|
||||
int y = vblock*16+(i/16);
|
||||
int x = hblock*16+(i&15);
|
||||
x = x >= width ? width-1 : x;
|
||||
y = y >= height ? height-1 : y;
|
||||
if (flipx) x = width - 1 - x;
|
||||
if (flipy) y = height - 1 - y;
|
||||
const unsigned char *c = raw + y*width*2+x*2-2;
|
||||
if (flipx) {
|
||||
Y[i+1] = c[0];
|
||||
CB[i/4] = c[1];
|
||||
Y[i] = c[2];
|
||||
CR[i/4] = c[3];
|
||||
} else {
|
||||
Y[i] = c[2];
|
||||
CB[i/4] = c[3];
|
||||
Y[i+1] = c[4];
|
||||
CR[i/4] = c[5];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int k1 = 0; k1 < 2; ++k1)
|
||||
{
|
||||
for (int k2 = 0; k2 < 2; ++k2)
|
||||
{
|
||||
for (int k1=0; k1<2; ++k1) {
|
||||
for (int k2=0; k2<2; ++k2) {
|
||||
float block[64];
|
||||
for (int i = 0; i < 64; i += 8)
|
||||
{
|
||||
int j = (i & 7) + (i & 56) * 2 + k1 * 8 * 16 + k2 * 8;
|
||||
memcpy(block + i, Y + j, 8 * sizeof(Y[0]));
|
||||
for (int i=0; i<64; i+=8) {
|
||||
int j = (i&7)+(i&56)*2 + k1*8*16 + k2*8;
|
||||
memcpy(block+i, Y+j, 8*sizeof(Y[0]));
|
||||
}
|
||||
lastDCY = jo_processDU(&bits, block, s_jo_HTDC_Y, lastDCY);
|
||||
}
|
||||
|
|
|
@ -1,37 +1,20 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2020 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum
|
||||
{
|
||||
typedef enum {
|
||||
JO_RGBX,
|
||||
JO_RGB24,
|
||||
JO_YUYV,
|
||||
} jo_mpeg_format_t;
|
||||
|
||||
typedef enum
|
||||
{
|
||||
typedef enum {
|
||||
JO_NONE,
|
||||
JO_FLIP_X,
|
||||
JO_FLIP_Y,
|
||||
} jo_mpeg_flip_t;
|
||||
|
||||
unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* rgbx, int width, int height, int format, int flipx, int flipy);
|
||||
unsigned long jo_write_mpeg(unsigned char *mpeg_buf, const unsigned char *rgbx, int width, int height, int format, int flipx, int flipy);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -11,9 +11,9 @@
|
|||
#include <stdint.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define JPGD_NORETURN __declspec(noreturn)
|
||||
#define JPGD_NORETURN __declspec(noreturn)
|
||||
#elif defined(__GNUC__)
|
||||
#define JPGD_NORETURN __attribute__((noreturn))
|
||||
#define JPGD_NORETURN __attribute__ ((noreturn))
|
||||
#else
|
||||
#define JPGD_NORETURN
|
||||
#endif
|
||||
|
@ -23,11 +23,11 @@
|
|||
|
||||
namespace jpgd
|
||||
{
|
||||
typedef unsigned char uint8;
|
||||
typedef signed short int16;
|
||||
typedef unsigned char uint8;
|
||||
typedef signed short int16;
|
||||
typedef unsigned short uint16;
|
||||
typedef unsigned int uint;
|
||||
typedef signed int int32;
|
||||
typedef unsigned int uint;
|
||||
typedef signed int int32;
|
||||
|
||||
// Loads a JPEG image from a memory buffer or a file.
|
||||
// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
|
||||
|
@ -40,42 +40,15 @@ namespace jpgd
|
|||
// Success/failure error codes.
|
||||
enum jpgd_status
|
||||
{
|
||||
JPGD_SUCCESS = 0,
|
||||
JPGD_FAILED = -1,
|
||||
JPGD_DONE = 1,
|
||||
JPGD_BAD_DHT_COUNTS = -256,
|
||||
JPGD_BAD_DHT_INDEX,
|
||||
JPGD_BAD_DHT_MARKER,
|
||||
JPGD_BAD_DQT_MARKER,
|
||||
JPGD_BAD_DQT_TABLE,
|
||||
JPGD_BAD_PRECISION,
|
||||
JPGD_BAD_HEIGHT,
|
||||
JPGD_BAD_WIDTH,
|
||||
JPGD_TOO_MANY_COMPONENTS,
|
||||
JPGD_BAD_SOF_LENGTH,
|
||||
JPGD_BAD_VARIABLE_MARKER,
|
||||
JPGD_BAD_DRI_LENGTH,
|
||||
JPGD_BAD_SOS_LENGTH,
|
||||
JPGD_BAD_SOS_COMP_ID,
|
||||
JPGD_W_EXTRA_BYTES_BEFORE_MARKER,
|
||||
JPGD_NO_ARITHMITIC_SUPPORT,
|
||||
JPGD_UNEXPECTED_MARKER,
|
||||
JPGD_NOT_JPEG,
|
||||
JPGD_UNSUPPORTED_MARKER,
|
||||
JPGD_BAD_DQT_LENGTH,
|
||||
JPGD_TOO_MANY_BLOCKS,
|
||||
JPGD_UNDEFINED_QUANT_TABLE,
|
||||
JPGD_UNDEFINED_HUFF_TABLE,
|
||||
JPGD_NOT_SINGLE_SCAN,
|
||||
JPGD_UNSUPPORTED_COLORSPACE,
|
||||
JPGD_UNSUPPORTED_SAMP_FACTORS,
|
||||
JPGD_DECODE_ERROR,
|
||||
JPGD_BAD_RESTART_MARKER,
|
||||
JPGD_BAD_SOS_SPECTRAL,
|
||||
JPGD_BAD_SOS_SUCCESSIVE,
|
||||
JPGD_STREAM_READ,
|
||||
JPGD_NOTENOUGHMEM,
|
||||
JPGD_TOO_MANY_SCANS
|
||||
JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
|
||||
JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
|
||||
JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
|
||||
JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
|
||||
JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
|
||||
JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
|
||||
JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
|
||||
JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
|
||||
JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
|
||||
};
|
||||
|
||||
// Input stream interface.
|
||||
|
@ -86,8 +59,8 @@ namespace jpgd
|
|||
class jpeg_decoder_stream
|
||||
{
|
||||
public:
|
||||
jpeg_decoder_stream() {}
|
||||
virtual ~jpeg_decoder_stream() {}
|
||||
jpeg_decoder_stream() { }
|
||||
virtual ~jpeg_decoder_stream() { }
|
||||
|
||||
// The read() method is called when the internal input buffer is empty.
|
||||
// Parameters:
|
||||
|
@ -103,7 +76,7 @@ namespace jpgd
|
|||
class jpeg_decoder_file_stream : public jpeg_decoder_stream
|
||||
{
|
||||
jpeg_decoder_file_stream(const jpeg_decoder_file_stream&);
|
||||
jpeg_decoder_file_stream& operator=(const jpeg_decoder_file_stream&);
|
||||
jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&);
|
||||
|
||||
FILE* m_pFile;
|
||||
bool m_eof_flag, m_error_flag;
|
||||
|
@ -125,28 +98,13 @@ namespace jpgd
|
|||
uint m_ofs, m_size;
|
||||
|
||||
public:
|
||||
jpeg_decoder_mem_stream()
|
||||
: m_pSrc_data(NULL)
|
||||
, m_ofs(0)
|
||||
, m_size(0)
|
||||
{
|
||||
}
|
||||
jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size)
|
||||
: m_pSrc_data(pSrc_data)
|
||||
, m_ofs(0)
|
||||
, m_size(size)
|
||||
{
|
||||
}
|
||||
jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
|
||||
jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
|
||||
|
||||
virtual ~jpeg_decoder_mem_stream() {}
|
||||
virtual ~jpeg_decoder_mem_stream() { }
|
||||
|
||||
bool open(const uint8* pSrc_data, uint size);
|
||||
void close()
|
||||
{
|
||||
m_pSrc_data = NULL;
|
||||
m_ofs = 0;
|
||||
m_size = 0;
|
||||
}
|
||||
void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
|
||||
|
||||
virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
|
||||
};
|
||||
|
@ -156,15 +114,8 @@ namespace jpgd
|
|||
|
||||
enum
|
||||
{
|
||||
JPGD_IN_BUF_SIZE = 8192,
|
||||
JPGD_MAX_BLOCKS_PER_MCU = 10,
|
||||
JPGD_MAX_HUFF_TABLES = 8,
|
||||
JPGD_MAX_QUANT_TABLES = 4,
|
||||
JPGD_MAX_COMPONENTS = 4,
|
||||
JPGD_MAX_COMPS_IN_SCAN = 4,
|
||||
JPGD_MAX_BLOCKS_PER_ROW = 16384,
|
||||
JPGD_MAX_HEIGHT = 32768,
|
||||
JPGD_MAX_WIDTH = 32768
|
||||
JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
|
||||
JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
|
||||
};
|
||||
|
||||
typedef int16 jpgd_quant_t;
|
||||
|
@ -191,7 +142,7 @@ namespace jpgd
|
|||
int begin_decoding();
|
||||
|
||||
// Returns the next scan line.
|
||||
// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
|
||||
// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
|
||||
// Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
|
||||
// Returns JPGD_SUCCESS if a scan line has been returned.
|
||||
// Returns JPGD_DONE if all scan lines have been returned.
|
||||
|
@ -213,17 +164,17 @@ namespace jpgd
|
|||
|
||||
private:
|
||||
jpeg_decoder(const jpeg_decoder&);
|
||||
jpeg_decoder& operator=(const jpeg_decoder&);
|
||||
jpeg_decoder& operator =(const jpeg_decoder&);
|
||||
|
||||
typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int);
|
||||
|
||||
struct huff_tables
|
||||
{
|
||||
bool ac_table;
|
||||
uint look_up[256];
|
||||
uint look_up2[256];
|
||||
uint look_up[256];
|
||||
uint look_up2[256];
|
||||
uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
|
||||
uint tree[JPGD_HUFF_TREE_MAX_LENGTH];
|
||||
uint tree[JPGD_HUFF_TREE_MAX_LENGTH];
|
||||
};
|
||||
|
||||
struct coeff_buf
|
||||
|
@ -263,26 +214,26 @@ namespace jpgd
|
|||
int m_comp_ident[JPGD_MAX_COMPONENTS]; // component's ID
|
||||
int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
|
||||
int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
|
||||
int m_comps_in_scan; // # of components in scan
|
||||
int m_comp_list[JPGD_MAX_COMPS_IN_SCAN]; // components in this scan
|
||||
int m_comp_dc_tab[JPGD_MAX_COMPONENTS]; // component's DC Huffman coding table selector
|
||||
int m_comp_ac_tab[JPGD_MAX_COMPONENTS]; // component's AC Huffman coding table selector
|
||||
int m_spectral_start; // spectral selection start
|
||||
int m_spectral_end; // spectral selection end
|
||||
int m_successive_low; // successive approximation low
|
||||
int m_successive_high; // successive approximation high
|
||||
int m_max_mcu_x_size; // MCU's max. X size in pixels
|
||||
int m_max_mcu_y_size; // MCU's max. Y size in pixels
|
||||
int m_comps_in_scan; // # of components in scan
|
||||
int m_comp_list[JPGD_MAX_COMPS_IN_SCAN]; // components in this scan
|
||||
int m_comp_dc_tab[JPGD_MAX_COMPONENTS]; // component's DC Huffman coding table selector
|
||||
int m_comp_ac_tab[JPGD_MAX_COMPONENTS]; // component's AC Huffman coding table selector
|
||||
int m_spectral_start; // spectral selection start
|
||||
int m_spectral_end; // spectral selection end
|
||||
int m_successive_low; // successive approximation low
|
||||
int m_successive_high; // successive approximation high
|
||||
int m_max_mcu_x_size; // MCU's max. X size in pixels
|
||||
int m_max_mcu_y_size; // MCU's max. Y size in pixels
|
||||
int m_blocks_per_mcu;
|
||||
int m_max_blocks_per_row;
|
||||
int m_mcus_per_row, m_mcus_per_col;
|
||||
int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
|
||||
int m_total_lines_left; // total # lines left in image
|
||||
int m_mcu_lines_left; // total # lines left in this MCU
|
||||
int m_total_lines_left; // total # lines left in image
|
||||
int m_mcu_lines_left; // total # lines left in this MCU
|
||||
int m_num_buffered_scanlines;
|
||||
int m_real_dest_bytes_per_scan_line;
|
||||
int m_dest_bytes_per_scan_line; // rounded up
|
||||
int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y)
|
||||
int m_dest_bytes_per_scan_line; // rounded up
|
||||
int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y)
|
||||
huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
|
||||
coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
|
||||
coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
|
||||
|
@ -324,12 +275,7 @@ namespace jpgd
|
|||
bool m_sample_buf_prev_valid;
|
||||
bool m_has_sse2;
|
||||
|
||||
inline int check_sample_buf_ofs(int ofs) const
|
||||
{
|
||||
assert(ofs >= 0);
|
||||
assert(ofs < m_max_blocks_per_row * 64);
|
||||
return ofs;
|
||||
}
|
||||
inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; }
|
||||
void free_all_blocks();
|
||||
JPGD_NORETURN void stop_decoding(jpgd_status status);
|
||||
void* alloc(size_t n, bool zero = false);
|
||||
|
|
|
@ -24,26 +24,26 @@
|
|||
#include <immintrin.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
|
||||
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
|
||||
#else
|
||||
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
|
||||
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
#define BITS_INV_ACC 4
|
||||
#define SHIFT_INV_ROW 16 - BITS_INV_ACC
|
||||
#define SHIFT_INV_COL 1 + BITS_INV_ACC
|
||||
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
|
||||
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
|
||||
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
|
||||
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
|
||||
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
|
||||
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
|
||||
|
||||
JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8]) = {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // cos * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8])= {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
|
||||
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// cos * (2<<16) + 0.5
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// Table for rows 0,4 - constants are multiplied on cos_4_16
|
||||
|
@ -56,22 +56,22 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
|
|||
16384, -8867, 16384, -21407, // w13 w12 w09 w08
|
||||
16384, 8867, -16384, -21407, // w07 w06 w03 w02
|
||||
-16384, 21407, 16384, -8867, // w15 w14 w11 w10
|
||||
22725, 19266, 19266, -4520, // w21 w20 w17 w16
|
||||
22725, 19266, 19266, -4520, // w21 w20 w17 w16
|
||||
12873, -22725, 4520, -12873, // w29 w28 w25 w24
|
||||
12873, 4520, -22725, -12873, // w23 w22 w19 w18
|
||||
4520, 19266, 19266, -22725}; // w31 w30 w27 w26
|
||||
|
||||
// Table for rows 1,7 - constants are multiplied on cos_1_16
|
||||
// Table for rows 1,7 - constants are multiplied on cos_1_16
|
||||
//movq -> w05 w04 w01 w00
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
|
||||
22725, 29692, 22725, 12299,
|
||||
22725, -12299, 22725, -29692, // w13 w12 w09 w08
|
||||
22725, 12299, -22725, -29692, // w07 w06 w03 w02
|
||||
-22725, 29692, 22725, -12299, // w15 w14 w11 w10
|
||||
31521, 26722, 26722, -6270, // w21 w20 w17 w16
|
||||
17855, -31521, 6270, -17855, // w29 w28 w25 w24
|
||||
17855, 6270, -31521, -17855, // w23 w22 w19 w18
|
||||
6270, 26722, 26722, -31521}; // w31 w30 w27 w26
|
||||
31521, 26722, 26722, -6270, // w21 w20 w17 w16
|
||||
17855, -31521, 6270, -17855, // w29 w28 w25 w24
|
||||
17855, 6270, -31521, -17855, // w23 w22 w19 w18
|
||||
6270, 26722, 26722, -31521}; // w31 w30 w27 w26
|
||||
|
||||
// Table for rows 2,6 - constants are multiplied on cos_2_16
|
||||
//movq -> w05 w04 w01 w00
|
||||
|
@ -80,10 +80,10 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
|
|||
21407, -11585, 21407, -27969, // w13 w12 w09 w08
|
||||
21407, 11585, -21407, -27969, // w07 w06 w03 w02
|
||||
-21407, 27969, 21407, -11585, // w15 w14 w11 w10
|
||||
29692, 25172, 25172, -5906, // w21 w20 w17 w16
|
||||
16819, -29692, 5906, -16819, // w29 w28 w25 w24
|
||||
16819, 5906, -29692, -16819, // w23 w22 w19 w18
|
||||
5906, 25172, 25172, -29692}; // w31 w30 w27 w26
|
||||
29692, 25172, 25172, -5906, // w21 w20 w17 w16
|
||||
16819, -29692, 5906, -16819, // w29 w28 w25 w24
|
||||
16819, 5906, -29692, -16819, // w23 w22 w19 w18
|
||||
5906, 25172, 25172, -29692}; // w31 w30 w27 w26
|
||||
// Table for rows 3,5 - constants are multiplied on cos_3_16
|
||||
//movq -> w05 w04 w01 w00
|
||||
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
|
||||
|
@ -91,28 +91,28 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
|
|||
19266, -10426, 19266, -25172, // w13 w12 w09 w08
|
||||
19266, 10426, -19266, -25172, // w07 w06 w03 w02
|
||||
-19266, 25172, 19266, -10426, // w15 w14 w11 w10
|
||||
26722, 22654, 22654, -5315, // w21 w20 w17 w16
|
||||
15137, -26722, 5315, -15137, // w29 w28 w25 w24
|
||||
15137, 5315, -26722, -15137, // w23 w22 w19 w18
|
||||
5315, 22654, 22654, -26722}; // w31 w30 w27 w26
|
||||
26722, 22654, 22654, -5315, // w21 w20 w17 w16
|
||||
15137, -26722, 5315, -15137, // w29 w28 w25 w24
|
||||
15137, 5315, -26722, -15137, // w23 w22 w19 w18
|
||||
5315, 22654, 22654, -26722}; // w31 w30 w27 w26
|
||||
|
||||
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = {128, 128, 128, 128, 128, 128, 128, 128};
|
||||
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };
|
||||
|
||||
void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
||||
void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
|
||||
{
|
||||
__m128i r_xmm0, r_xmm4;
|
||||
__m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
|
||||
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
short* pTab_i_04 = shortM128_tab_i_04;
|
||||
short* pTab_i_26 = shortM128_tab_i_26;
|
||||
short * pTab_i_04 = shortM128_tab_i_04;
|
||||
short * pTab_i_26 = shortM128_tab_i_26;
|
||||
|
||||
//Get pointers for this input and output
|
||||
pTab_i_04 = shortM128_tab_i_04;
|
||||
pTab_i_26 = shortM128_tab_i_26;
|
||||
|
||||
//Row 1 and Row 3
|
||||
r_xmm0 = _mm_load_si128((__m128i*)pInput);
|
||||
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[2 * 8]));
|
||||
r_xmm0 = _mm_load_si128((__m128i *) pInput);
|
||||
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));
|
||||
|
||||
// *** Work on the data in xmm0
|
||||
//low shuffle mask = 0xd8 = 11 01 10 00
|
||||
|
@ -121,58 +121,58 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
|
||||
// copy short 2 and short 0 to all locations
|
||||
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
|
||||
|
||||
|
||||
// add to those copies
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
|
||||
|
||||
// shuffle mask = 0x55 = 01 01 01 01
|
||||
// copy short 3 and short 1 to all locations
|
||||
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
|
||||
|
||||
|
||||
// high shuffle mask = 0xd8 = 11 01 10 00
|
||||
// get short 6 and short 4 into bit positions 64-95
|
||||
// get short 7 and short 5 into bit positions 96-127
|
||||
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
|
||||
|
||||
|
||||
// add to short 3 and short 1
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
|
||||
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
|
||||
|
||||
// shuffle mask = 0xaa = 10 10 10 10
|
||||
// copy short 6 and short 4 to all locations
|
||||
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
|
||||
|
||||
|
||||
// shuffle mask = 0xff = 11 11 11 11
|
||||
// copy short 7 and short 5 to all locations
|
||||
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
|
||||
|
||||
|
||||
// add to short 6 and short 4
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
|
||||
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
|
||||
|
||||
// *** Work on the data in xmm4
|
||||
// high shuffle mask = 0xd8 = 11 01 10 00
|
||||
// get short 6 and short 4 into bit positions 64-95
|
||||
// get short 7 and short 5 into bit positions 96-127
|
||||
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
|
||||
|
||||
|
||||
// (xmm0 short 2 and short 0 plus pSi) + some constants
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
|
||||
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
|
||||
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
|
||||
r_xmm2 = r_xmm1;
|
||||
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
|
||||
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
|
||||
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
|
||||
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
|
||||
r_xmm6 = r_xmm5;
|
||||
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
|
||||
|
@ -187,37 +187,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
|
||||
|
||||
//Row 5 and row 7
|
||||
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[4 * 8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[6 * 8]));
|
||||
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));
|
||||
|
||||
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
|
||||
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
|
||||
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
|
||||
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
|
||||
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
|
||||
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
|
||||
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
|
||||
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
|
||||
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
|
||||
r_xmm2 = r_xmm1;
|
||||
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
|
||||
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
|
||||
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
|
||||
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
|
||||
r_xmm6 = r_xmm5;
|
||||
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
|
||||
|
@ -234,37 +234,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
//Row 4 and row 2
|
||||
pTab_i_04 = shortM128_tab_i_35;
|
||||
pTab_i_26 = shortM128_tab_i_17;
|
||||
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[3 * 8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[1 * 8]));
|
||||
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));
|
||||
|
||||
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
|
||||
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
|
||||
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
|
||||
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
|
||||
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
|
||||
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
|
||||
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
|
||||
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
|
||||
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
|
||||
r_xmm2 = r_xmm1;
|
||||
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
|
||||
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
|
||||
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
|
||||
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
|
||||
r_xmm6 = r_xmm5;
|
||||
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
|
||||
|
@ -279,37 +279,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
|
||||
|
||||
//Row 6 and row 8
|
||||
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[5 * 8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[7 * 8]));
|
||||
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
|
||||
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));
|
||||
|
||||
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
|
||||
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
|
||||
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
|
||||
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
|
||||
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
|
||||
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
|
||||
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
|
||||
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
|
||||
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
|
||||
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
|
||||
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
|
||||
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
|
||||
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
|
||||
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
|
||||
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
|
||||
r_xmm2 = r_xmm1;
|
||||
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
|
||||
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
|
||||
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
|
||||
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
|
||||
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
|
||||
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
|
||||
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
|
||||
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
|
||||
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
|
||||
r_xmm6 = r_xmm5;
|
||||
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
|
||||
|
@ -323,13 +323,13 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
|
||||
row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
|
||||
|
||||
r_xmm1 = _mm_load_si128((__m128i*)shortM128_tg_3_16);
|
||||
r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
|
||||
r_xmm2 = row5;
|
||||
r_xmm3 = row3;
|
||||
r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
|
||||
|
||||
r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
|
||||
r_xmm5 = _mm_load_si128((__m128i*)shortM128_tg_1_16);
|
||||
r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
|
||||
r_xmm6 = row7;
|
||||
r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
|
||||
|
||||
|
@ -339,7 +339,7 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm7 = row6;
|
||||
|
||||
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
|
||||
r_xmm3 = _mm_load_si128((__m128i*)shortM128_tg_2_16);
|
||||
r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
|
||||
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
|
||||
r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
|
||||
r_xmm1 = r_xmm0;
|
||||
|
@ -347,11 +347,11 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
|
||||
r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
|
||||
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
|
||||
r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i*)shortM128_one_corr));
|
||||
r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
|
||||
r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
|
||||
r_xmm6 = r_xmm5;
|
||||
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
|
||||
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_one_corr));
|
||||
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
|
||||
|
||||
//Intermediate results, needed later
|
||||
|
@ -359,9 +359,9 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
temp7 = r_xmm0;
|
||||
|
||||
r_xmm1 = r_xmm4;
|
||||
r_xmm0 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
|
||||
r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
|
||||
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
|
||||
r_xmm2 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
|
||||
r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
|
||||
r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
|
||||
|
||||
//Intermediate results, needed later
|
||||
|
@ -377,24 +377,24 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
|
||||
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
|
||||
|
||||
r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i*)shortM128_one_corr));
|
||||
r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
|
||||
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
|
||||
r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i*)shortM128_one_corr));
|
||||
r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));
|
||||
|
||||
r_xmm2 = r_xmm5;
|
||||
r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
|
||||
r_xmm1 = r_xmm6;
|
||||
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_round_inv_col));
|
||||
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
|
||||
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
|
||||
r_xmm7 = temp7;
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i*)shortM128_round_inv_col));
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
|
||||
r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
|
||||
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
|
||||
r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
|
||||
r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i*)shortM128_round_inv_corr));
|
||||
r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
|
||||
r_xmm3 = r_xmm6;
|
||||
r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i*)shortM128_round_inv_corr));
|
||||
r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
|
||||
|
||||
//Store results for row 0
|
||||
|
@ -406,7 +406,7 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
|
||||
|
||||
//Store results for row 1
|
||||
//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
|
||||
//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
|
||||
__m128i r1 = r_xmm6;
|
||||
|
||||
r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
|
||||
|
@ -415,24 +415,24 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
|
||||
|
||||
//Store results for row 2
|
||||
//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
|
||||
//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
|
||||
__m128i r2 = r_xmm1;
|
||||
|
||||
r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
|
||||
r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
|
||||
r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
|
||||
|
||||
//Store results for row 7
|
||||
//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
|
||||
//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
|
||||
__m128i r7 = r_xmm5;
|
||||
|
||||
r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
|
||||
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
|
||||
r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
|
||||
r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
|
||||
r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
|
||||
r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
|
||||
|
||||
//Store results for row 3
|
||||
//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
|
||||
//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
|
||||
__m128i r3 = r_xmm6;
|
||||
|
||||
r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
|
||||
|
@ -446,17 +446,17 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
|
|||
__m128i r5 = r_xmm7;
|
||||
__m128i r6 = r_xmm3;
|
||||
|
||||
r0 = _mm_add_epi16(*(const __m128i*)shortM128_128, r0);
|
||||
r1 = _mm_add_epi16(*(const __m128i*)shortM128_128, r1);
|
||||
r2 = _mm_add_epi16(*(const __m128i*)shortM128_128, r2);
|
||||
r3 = _mm_add_epi16(*(const __m128i*)shortM128_128, r3);
|
||||
r4 = _mm_add_epi16(*(const __m128i*)shortM128_128, r4);
|
||||
r5 = _mm_add_epi16(*(const __m128i*)shortM128_128, r5);
|
||||
r6 = _mm_add_epi16(*(const __m128i*)shortM128_128, r6);
|
||||
r7 = _mm_add_epi16(*(const __m128i*)shortM128_128, r7);
|
||||
r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
|
||||
r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
|
||||
r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
|
||||
r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
|
||||
r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
|
||||
r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
|
||||
r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
|
||||
r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);
|
||||
|
||||
((__m128i*)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
|
||||
((__m128i*)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
|
||||
((__m128i*)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
|
||||
((__m128i*)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
|
||||
((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
|
||||
((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
|
||||
((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
|
||||
((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue