USB: restore formatting of standalone third party code

This commit is contained in:
GovanifY 2020-11-04 15:19:01 +01:00 committed by refractionpcsx2
parent 9da3d9a5bf
commit ede7fa86fa
5 changed files with 512 additions and 1344 deletions

View File

@ -35,163 +35,38 @@
#include "jo_mpeg.h"
// Huffman tables
static const unsigned char s_jo_HTDC_Y[9][2] = {{4, 3}, {0, 2}, {1, 2}, {5, 3}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}};
static const unsigned char s_jo_HTDC_C[9][2] = {{0, 2}, {1, 2}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {126, 7}, {254, 8}};
static const unsigned char s_jo_HTDC_Y[9][2] = {{4,3}, {0,2}, {1,2}, {5,3}, {6,3}, {14,4}, {30,5}, {62,6}, {126,7}};
static const unsigned char s_jo_HTDC_C[9][2] = {{0,2}, {1,2}, {2,2}, {6,3}, {14,4}, {30,5}, {62,6}, {126,7}, {254,8}};
static const unsigned char s_jo_HTAC[32][40][2] = {
{
{6, 3},
{8, 5},
{10, 6},
{12, 8},
{76, 9},
{66, 9},
{20, 11},
{58, 13},
{48, 13},
{38, 13},
{32, 13},
{52, 14},
{50, 14},
{48, 14},
{46, 14},
{62, 15},
{62, 15},
{58, 15},
{56, 15},
{54, 15},
{52, 15},
{50, 15},
{48, 15},
{46, 15},
{44, 15},
{42, 15},
{40, 15},
{38, 15},
{36, 15},
{34, 15},
{32, 15},
{48, 16},
{46, 16},
{44, 16},
{42, 16},
{40, 16},
{38, 16},
{36, 16},
{34, 16},
{32, 16},
},
{{6, 4}, {12, 7}, {74, 9}, {24, 11}, {54, 13}, {44, 14}, {42, 14}, {62, 16}, {60, 16}, {58, 16}, {56, 16}, {54, 16}, {52, 16}, {50, 16}, {38, 17}, {36, 17}, {34, 17}, {32, 17}},
{{10, 5}, {8, 8}, {22, 11}, {40, 13}, {40, 14}},
{{14, 6}, {72, 9}, {56, 13}, {38, 14}},
{{12, 6}, {30, 11}, {36, 13}},
{{14, 7}, {18, 11}, {36, 14}},
{{10, 7}, {60, 13}, {40, 17}},
{{8, 7}, {42, 13}},
{{14, 8}, {34, 13}},
{{10, 8}, {34, 14}},
{{78, 9}, {32, 14}},
{{70, 9}, {52, 17}},
{{68, 9}, {50, 17}},
{{64, 9}, {48, 17}},
{{28, 11}, {46, 17}},
{{26, 11}, {44, 17}},
{{16, 11}, {42, 17}},
{{62, 13}},
{{52, 13}},
{{50, 13}},
{{46, 13}},
{{44, 13}},
{{62, 14}},
{{60, 14}},
{{58, 14}},
{{56, 14}},
{{54, 14}},
{{62, 17}},
{{60, 17}},
{{58, 17}},
{{56, 17}},
{{54, 17}},
{{6,3},{8,5},{10,6},{12,8},{76,9},{66,9},{20,11},{58,13},{48,13},{38,13},{32,13},{52,14},{50,14},{48,14},{46,14},{62,15},{62,15},{58,15},{56,15},{54,15},{52,15},{50,15},{48,15},{46,15},{44,15},{42,15},{40,15},{38,15},{36,15},{34,15},{32,15},{48,16},{46,16},{44,16},{42,16},{40,16},{38,16},{36,16},{34,16},{32,16},},
{{6,4},{12,7},{74,9},{24,11},{54,13},{44,14},{42,14},{62,16},{60,16},{58,16},{56,16},{54,16},{52,16},{50,16},{38,17},{36,17},{34,17},{32,17}},
{{10,5},{8,8},{22,11},{40,13},{40,14}},
{{14,6},{72,9},{56,13},{38,14}},
{{12,6},{30,11},{36,13}}, {{14,7},{18,11},{36,14}}, {{10,7},{60,13},{40,17}},
{{8,7},{42,13}}, {{14,8},{34,13}}, {{10,8},{34,14}}, {{78,9},{32,14}}, {{70,9},{52,17}}, {{68,9},{50,17}}, {{64,9},{48,17}}, {{28,11},{46,17}}, {{26,11},{44,17}}, {{16,11},{42,17}},
{{62,13}}, {{52,13}}, {{50,13}}, {{46,13}}, {{44,13}}, {{62,14}}, {{60,14}}, {{58,14}}, {{56,14}}, {{54,14}}, {{62,17}}, {{60,17}}, {{58,17}}, {{56,17}}, {{54,17}},
};
static const float s_jo_quantTbl[64] = {
0.015625f,
0.005632f,
0.005035f,
0.004832f,
0.004808f,
0.005892f,
0.007964f,
0.013325f,
0.005632f,
0.004061f,
0.003135f,
0.003193f,
0.003338f,
0.003955f,
0.004898f,
0.008828f,
0.005035f,
0.003135f,
0.002816f,
0.003013f,
0.003299f,
0.003581f,
0.005199f,
0.009125f,
0.004832f,
0.003484f,
0.003129f,
0.003348f,
0.003666f,
0.003979f,
0.005309f,
0.009632f,
0.005682f,
0.003466f,
0.003543f,
0.003666f,
0.003906f,
0.004546f,
0.005774f,
0.009439f,
0.006119f,
0.004248f,
0.004199f,
0.004228f,
0.004546f,
0.005062f,
0.006124f,
0.009942f,
0.008883f,
0.006167f,
0.006096f,
0.005777f,
0.006078f,
0.006391f,
0.007621f,
0.012133f,
0.016780f,
0.011263f,
0.009907f,
0.010139f,
0.009849f,
0.010297f,
0.012133f,
0.019785f,
0.015625f,0.005632f,0.005035f,0.004832f,0.004808f,0.005892f,0.007964f,0.013325f,
0.005632f,0.004061f,0.003135f,0.003193f,0.003338f,0.003955f,0.004898f,0.008828f,
0.005035f,0.003135f,0.002816f,0.003013f,0.003299f,0.003581f,0.005199f,0.009125f,
0.004832f,0.003484f,0.003129f,0.003348f,0.003666f,0.003979f,0.005309f,0.009632f,
0.005682f,0.003466f,0.003543f,0.003666f,0.003906f,0.004546f,0.005774f,0.009439f,
0.006119f,0.004248f,0.004199f,0.004228f,0.004546f,0.005062f,0.006124f,0.009942f,
0.008883f,0.006167f,0.006096f,0.005777f,0.006078f,0.006391f,0.007621f,0.012133f,
0.016780f,0.011263f,0.009907f,0.010139f,0.009849f,0.010297f,0.012133f,0.019785f,
};
static const unsigned char s_jo_ZigZag[] = {0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63};
static const unsigned char s_jo_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
typedef struct
{
unsigned char* buf_ptr;
typedef struct {
unsigned char *buf_ptr;
int buf, cnt;
} jo_bits_t;
static void jo_writeBits(jo_bits_t* b, int value, int count)
{
static void jo_writeBits(jo_bits_t *b, int value, int count) {
b->cnt += count;
b->buf |= value << (24 - b->cnt);
while (b->cnt >= 8)
{
while(b->cnt >= 8) {
unsigned char c = (b->buf >> 16) & 255;
//putc(c, b->fp);
*(b->buf_ptr) = c & 0xff;
@ -201,8 +76,7 @@ static void jo_writeBits(jo_bits_t* b, int value, int count)
}
}
static void jo_DCT(float* d0, float* d1, float* d2, float* d3, float* d4, float* d5, float* d6, float* d7)
{
static void jo_DCT(float *d0, float *d1, float *d2, float *d3, float *d4, float *d5, float *d6, float *d7) {
float tmp0 = *d0 + *d7;
float tmp7 = *d0 - *d7;
float tmp1 = *d1 + *d6;
@ -213,52 +87,48 @@ static void jo_DCT(float* d0, float* d1, float* d2, float* d3, float* d4, float*
float tmp4 = *d3 - *d4;
// Even part
float tmp10 = tmp0 + tmp3; // phase 2
float tmp10 = tmp0 + tmp3; // phase 2
float tmp13 = tmp0 - tmp3;
float tmp11 = tmp1 + tmp2;
float tmp12 = tmp1 - tmp2;
*d0 = tmp10 + tmp11; // phase 3
*d0 = tmp10 + tmp11; // phase 3
*d4 = tmp10 - tmp11;
float z1 = (tmp12 + tmp13) * 0.707106781f; // c4
*d2 = tmp13 + z1; // phase 5
*d2 = tmp13 + z1; // phase 5
*d6 = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5; // phase 2
tmp10 = tmp4 + tmp5; // phase 2
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
// The rotator is modified from fig 4-8 to avoid extra negations.
float z5 = (tmp10 - tmp12) * 0.382683433f; // c6
float z2 = tmp10 * 0.541196100f + z5; // c2-c6
float z4 = tmp12 * 1.306562965f + z5; // c2+c6
float z3 = tmp11 * 0.707106781f; // c4
float z2 = tmp10 * 0.541196100f + z5; // c2-c6
float z4 = tmp12 * 1.306562965f + z5; // c2+c6
float z3 = tmp11 * 0.707106781f; // c4
float z11 = tmp7 + z3; // phase 5
float z11 = tmp7 + z3; // phase 5
float z13 = tmp7 - z3;
*d5 = z13 + z2; // phase 6
*d5 = z13 + z2; // phase 6
*d3 = z13 - z2;
*d1 = z11 + z4;
*d7 = z11 - z4;
}
static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9][2], int DC)
{
for (int dataOff = 0; dataOff < 64; dataOff += 8)
{
jo_DCT(&A[dataOff], &A[dataOff + 1], &A[dataOff + 2], &A[dataOff + 3], &A[dataOff + 4], &A[dataOff + 5], &A[dataOff + 6], &A[dataOff + 7]);
static int jo_processDU(jo_bits_t *bits, float A[64], const unsigned char htdc[9][2], int DC) {
for(int dataOff=0; dataOff<64; dataOff+=8) {
jo_DCT(&A[dataOff], &A[dataOff+1], &A[dataOff+2], &A[dataOff+3], &A[dataOff+4], &A[dataOff+5], &A[dataOff+6], &A[dataOff+7]);
}
for (int dataOff = 0; dataOff < 8; ++dataOff)
{
jo_DCT(&A[dataOff], &A[dataOff + 8], &A[dataOff + 16], &A[dataOff + 24], &A[dataOff + 32], &A[dataOff + 40], &A[dataOff + 48], &A[dataOff + 56]);
for(int dataOff=0; dataOff<8; ++dataOff) {
jo_DCT(&A[dataOff], &A[dataOff+8], &A[dataOff+16], &A[dataOff+24], &A[dataOff+32], &A[dataOff+40], &A[dataOff+48], &A[dataOff+56]);
}
int Q[64];
for (int i = 0; i < 64; ++i)
{
float v = A[i] * s_jo_quantTbl[i];
for(int i=0; i<64; ++i) {
float v = A[i]*s_jo_quantTbl[i];
Q[s_jo_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
}
@ -266,48 +136,36 @@ static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9
int aDC = DC < 0 ? -DC : DC;
int size = 0;
int tempval = aDC;
while (tempval)
{
while(tempval) {
size++;
tempval >>= 1;
}
jo_writeBits(bits, htdc[size][0], htdc[size][1]);
if (DC < 0)
aDC ^= (1 << size) - 1;
if(DC < 0) aDC ^= (1 << size) - 1;
jo_writeBits(bits, aDC, size);
int endpos = 63;
for (; (endpos > 0) && (Q[endpos] == 0); --endpos)
{ /* do nothing */
}
for (int i = 1; i <= endpos;)
{
for(; (endpos>0)&&(Q[endpos]==0); --endpos) { /* do nothing */ }
for(int i = 1; i <= endpos;) {
int run = 0;
while (Q[i] == 0 && i < endpos)
{
while (Q[i]==0 && i<endpos) {
++run;
++i;
}
int AC = Q[i++];
int aAC = AC < 0 ? -AC : AC;
int code = 0, size = 0;
if (run < 32 && aAC <= 40)
{
code = s_jo_HTAC[run][aAC - 1][0];
size = s_jo_HTAC[run][aAC - 1][1];
if (AC < 0)
code += 1;
if (run<32 && aAC<=40) {
code = s_jo_HTAC[run][aAC-1][0];
size = s_jo_HTAC[run][aAC-1][1];
if (AC < 0) code += 1;
}
if (!size)
{
if(!size) {
jo_writeBits(bits, 1, 6);
jo_writeBits(bits, run, 6);
if (AC < -127)
{
if (AC < -127) {
jo_writeBits(bits, 128, 12);
}
else if (AC > 127)
{
} else if(AC > 127) {
jo_writeBits(bits, 0, 12);
}
code = AC & 0xFFF;
@ -320,23 +178,17 @@ static int jo_processDU(jo_bits_t* bits, float A[64], const unsigned char htdc[9
return Q[0];
}
unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* raw, int width, int height, int format, int flipx, int flipy)
{
unsigned long jo_write_mpeg(unsigned char *mpeg_buf, const unsigned char *raw, int width, int height, int format, int flipx, int flipy) {
int lastDCY = 128, lastDCCR = 128, lastDCCB = 128;
unsigned char* head = mpeg_buf;
unsigned char *head = mpeg_buf;
jo_bits_t bits = {mpeg_buf};
for (int vblock = 0; vblock < (height + 15) / 16; vblock++)
{
for (int hblock = 0; hblock < (width + 15) / 16; hblock++)
{
if (vblock == 0 && hblock == 0)
{
for (int vblock = 0; vblock < (height+15)/16; vblock++) {
for (int hblock = 0; hblock < (width+15)/16; hblock++) {
if (vblock == 0 && hblock == 0) {
jo_writeBits(&bits, 0b01, 2); // macroblock_type = intra+quant
jo_writeBits(&bits, 8, 5); // quantiser_scale_code = 8
}
else
{
jo_writeBits(&bits, 8, 5); // quantiser_scale_code = 8
} else {
jo_writeBits(&bits, 0b1, 1); // macroblock_address_increment
jo_writeBits(&bits, 0b1, 1); // macroblock_type = intra
}
@ -344,113 +196,87 @@ unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* raw, i
float Y[256], CBx[256], CRx[256];
float CB[64], CR[64];
if (format == JO_RGBX)
{
for (int i = 0; i < 256; ++i)
{
int y = vblock * 16 + (i / 16);
int x = hblock * 16 + (i & 15);
x = x >= width ? width - 1 : x;
y = y >= height ? height - 1 : y;
if (flipx)
x = width - 1 - x;
if (flipy)
y = height - 1 - y;
const unsigned char* c = raw + y * width * 4 + x * 4;
if (format == JO_RGBX) {
for (int i=0; i<256; ++i) {
int y = vblock*16+(i/16);
int x = hblock*16+(i&15);
x = x >= width ? width-1 : x;
y = y >= height ? height-1 : y;
if (flipx) x = width - 1 - x;
if (flipy) y = height - 1 - y;
const unsigned char *c = raw + y*width*4+x*4;
float r, g, b;
if (flipx && flipy)
{
if (flipx && flipy) {
r = c[2], g = c[1], b = c[0];
}
else
{
} else {
r = c[0], g = c[1], b = c[2];
}
Y[i] = (0.299f * r + 0.587f * g + 0.114f * b) * (219.f / 255) + 16;
CBx[i] = (-0.299f * r - 0.587f * g + 0.886f * b) * (224.f / 255) + 128;
CRx[i] = (0.701f * r - 0.587f * g - 0.114f * b) * (224.f / 255) + 128;
Y[i] = (0.299f*r + 0.587f*g + 0.114f*b) * (219.f/255) + 16;
CBx[i] = (-0.299f*r - 0.587f*g + 0.886f*b) * (224.f/255) + 128;
CRx[i] = (0.701f*r - 0.587f*g - 0.114f*b) * (224.f/255) + 128;
}
// Downsample Cb,Cr (420 format)
for (int i = 0; i < 64; ++i)
{
int j = (i & 7) * 2 + (i & 56) * 4;
CB[i] = (CBx[j] + CBx[j + 1] + CBx[j + 16] + CBx[j + 17]) * 0.25f;
CR[i] = (CRx[j] + CRx[j + 1] + CRx[j + 16] + CRx[j + 17]) * 0.25f;
for (int i=0; i<64; ++i) {
int j =(i&7)*2 + (i&56)*4;
CB[i] = (CBx[j] + CBx[j+1] + CBx[j+16] + CBx[j+17]) * 0.25f;
CR[i] = (CRx[j] + CRx[j+1] + CRx[j+16] + CRx[j+17]) * 0.25f;
}
}
else if (format == JO_RGB24)
{
for (int i = 0; i < 256; ++i)
{
int y = vblock * 16 + (i / 16);
int x = hblock * 16 + (i & 15);
x = x >= width ? width - 1 : x;
y = y >= height ? height - 1 : y;
if (flipx)
x = width - 1 - x;
if (flipy)
y = height - 1 - y;
const unsigned char* c = raw + y * width * 3 + x * 3;
} else
if (format == JO_RGB24) {
for (int i=0; i<256; ++i) {
int y = vblock*16+(i/16);
int x = hblock*16+(i&15);
x = x >= width ? width-1 : x;
y = y >= height ? height-1 : y;
if (flipx) x = width - 1 - x;
if (flipy) y = height - 1 - y;
const unsigned char *c = raw + y*width*3+x*3;
float r, g, b;
if (flipx && flipy)
{
if (flipx && flipy) {
r = c[2], g = c[1], b = c[0];
}
else
{
} else {
r = c[0], g = c[1], b = c[2];
}
Y[i] = (0.299f * r + 0.587f * g + 0.114f * b) * (219.f / 255) + 16;
CBx[i] = (-0.299f * r - 0.587f * g + 0.886f * b) * (224.f / 255) + 128;
CRx[i] = (0.701f * r - 0.587f * g - 0.114f * b) * (224.f / 255) + 128;
Y[i] = (0.299f*r + 0.587f*g + 0.114f*b) * (219.f/255) + 16;
CBx[i] = (-0.299f*r - 0.587f*g + 0.886f*b) * (224.f/255) + 128;
CRx[i] = (0.701f*r - 0.587f*g - 0.114f*b) * (224.f/255) + 128;
}
// Downsample Cb,Cr (420 format)
for (int i = 0; i < 64; ++i)
{
int j = (i & 7) * 2 + (i & 56) * 4;
CB[i] = (CBx[j] + CBx[j + 1] + CBx[j + 16] + CBx[j + 17]) * 0.25f;
CR[i] = (CRx[j] + CRx[j + 1] + CRx[j + 16] + CRx[j + 17]) * 0.25f;
for (int i=0; i<64; ++i) {
int j =(i&7)*2 + (i&56)*4;
CB[i] = (CBx[j] + CBx[j+1] + CBx[j+16] + CBx[j+17]) * 0.25f;
CR[i] = (CRx[j] + CRx[j+1] + CRx[j+16] + CRx[j+17]) * 0.25f;
}
}
else if (format == JO_YUYV)
{
for (int i = 0; i < 256; i += 2)
{
int y = vblock * 16 + (i / 16);
int x = hblock * 16 + (i & 15);
x = x >= width ? width - 1 : x;
y = y >= height ? height - 1 : y;
if (flipx)
x = width - 1 - x;
if (flipy)
y = height - 1 - y;
const unsigned char* c = raw + y * width * 2 + x * 2 - 2;
if (flipx)
{
Y[i + 1] = c[0];
CB[i / 4] = c[1];
Y[i] = c[2];
CR[i / 4] = c[3];
}
else
{
Y[i] = c[2];
CB[i / 4] = c[3];
Y[i + 1] = c[4];
CR[i / 4] = c[5];
} else
if (format == JO_YUYV) {
for (int i=0; i<256; i+=2) {
int y = vblock*16+(i/16);
int x = hblock*16+(i&15);
x = x >= width ? width-1 : x;
y = y >= height ? height-1 : y;
if (flipx) x = width - 1 - x;
if (flipy) y = height - 1 - y;
const unsigned char *c = raw + y*width*2+x*2-2;
if (flipx) {
Y[i+1] = c[0];
CB[i/4] = c[1];
Y[i] = c[2];
CR[i/4] = c[3];
} else {
Y[i] = c[2];
CB[i/4] = c[3];
Y[i+1] = c[4];
CR[i/4] = c[5];
}
}
}
for (int k1 = 0; k1 < 2; ++k1)
{
for (int k2 = 0; k2 < 2; ++k2)
{
for (int k1=0; k1<2; ++k1) {
for (int k2=0; k2<2; ++k2) {
float block[64];
for (int i = 0; i < 64; i += 8)
{
int j = (i & 7) + (i & 56) * 2 + k1 * 8 * 16 + k2 * 8;
memcpy(block + i, Y + j, 8 * sizeof(Y[0]));
for (int i=0; i<64; i+=8) {
int j = (i&7)+(i&56)*2 + k1*8*16 + k2*8;
memcpy(block+i, Y+j, 8*sizeof(Y[0]));
}
lastDCY = jo_processDU(&bits, block, s_jo_HTDC_Y, lastDCY);
}

View File

@ -1,37 +1,20 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2020 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#ifdef __cplusplus
extern "C" {
#endif
typedef enum
{
typedef enum {
JO_RGBX,
JO_RGB24,
JO_YUYV,
} jo_mpeg_format_t;
typedef enum
{
typedef enum {
JO_NONE,
JO_FLIP_X,
JO_FLIP_Y,
} jo_mpeg_flip_t;
unsigned long jo_write_mpeg(unsigned char* mpeg_buf, const unsigned char* rgbx, int width, int height, int format, int flipx, int flipy);
unsigned long jo_write_mpeg(unsigned char *mpeg_buf, const unsigned char *rgbx, int width, int height, int format, int flipx, int flipy);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load Diff

View File

@ -11,9 +11,9 @@
#include <stdint.h>
#ifdef _MSC_VER
#define JPGD_NORETURN __declspec(noreturn)
#define JPGD_NORETURN __declspec(noreturn)
#elif defined(__GNUC__)
#define JPGD_NORETURN __attribute__((noreturn))
#define JPGD_NORETURN __attribute__ ((noreturn))
#else
#define JPGD_NORETURN
#endif
@ -23,11 +23,11 @@
namespace jpgd
{
typedef unsigned char uint8;
typedef signed short int16;
typedef unsigned char uint8;
typedef signed short int16;
typedef unsigned short uint16;
typedef unsigned int uint;
typedef signed int int32;
typedef unsigned int uint;
typedef signed int int32;
// Loads a JPEG image from a memory buffer or a file.
// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
@ -40,42 +40,15 @@ namespace jpgd
// Success/failure error codes.
enum jpgd_status
{
JPGD_SUCCESS = 0,
JPGD_FAILED = -1,
JPGD_DONE = 1,
JPGD_BAD_DHT_COUNTS = -256,
JPGD_BAD_DHT_INDEX,
JPGD_BAD_DHT_MARKER,
JPGD_BAD_DQT_MARKER,
JPGD_BAD_DQT_TABLE,
JPGD_BAD_PRECISION,
JPGD_BAD_HEIGHT,
JPGD_BAD_WIDTH,
JPGD_TOO_MANY_COMPONENTS,
JPGD_BAD_SOF_LENGTH,
JPGD_BAD_VARIABLE_MARKER,
JPGD_BAD_DRI_LENGTH,
JPGD_BAD_SOS_LENGTH,
JPGD_BAD_SOS_COMP_ID,
JPGD_W_EXTRA_BYTES_BEFORE_MARKER,
JPGD_NO_ARITHMITIC_SUPPORT,
JPGD_UNEXPECTED_MARKER,
JPGD_NOT_JPEG,
JPGD_UNSUPPORTED_MARKER,
JPGD_BAD_DQT_LENGTH,
JPGD_TOO_MANY_BLOCKS,
JPGD_UNDEFINED_QUANT_TABLE,
JPGD_UNDEFINED_HUFF_TABLE,
JPGD_NOT_SINGLE_SCAN,
JPGD_UNSUPPORTED_COLORSPACE,
JPGD_UNSUPPORTED_SAMP_FACTORS,
JPGD_DECODE_ERROR,
JPGD_BAD_RESTART_MARKER,
JPGD_BAD_SOS_SPECTRAL,
JPGD_BAD_SOS_SUCCESSIVE,
JPGD_STREAM_READ,
JPGD_NOTENOUGHMEM,
JPGD_TOO_MANY_SCANS
JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
};
// Input stream interface.
@ -86,8 +59,8 @@ namespace jpgd
class jpeg_decoder_stream
{
public:
jpeg_decoder_stream() {}
virtual ~jpeg_decoder_stream() {}
jpeg_decoder_stream() { }
virtual ~jpeg_decoder_stream() { }
// The read() method is called when the internal input buffer is empty.
// Parameters:
@ -103,7 +76,7 @@ namespace jpgd
class jpeg_decoder_file_stream : public jpeg_decoder_stream
{
jpeg_decoder_file_stream(const jpeg_decoder_file_stream&);
jpeg_decoder_file_stream& operator=(const jpeg_decoder_file_stream&);
jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&);
FILE* m_pFile;
bool m_eof_flag, m_error_flag;
@ -125,28 +98,13 @@ namespace jpgd
uint m_ofs, m_size;
public:
jpeg_decoder_mem_stream()
: m_pSrc_data(NULL)
, m_ofs(0)
, m_size(0)
{
}
jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size)
: m_pSrc_data(pSrc_data)
, m_ofs(0)
, m_size(size)
{
}
jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
virtual ~jpeg_decoder_mem_stream() {}
virtual ~jpeg_decoder_mem_stream() { }
bool open(const uint8* pSrc_data, uint size);
void close()
{
m_pSrc_data = NULL;
m_ofs = 0;
m_size = 0;
}
void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
};
@ -156,15 +114,8 @@ namespace jpgd
enum
{
JPGD_IN_BUF_SIZE = 8192,
JPGD_MAX_BLOCKS_PER_MCU = 10,
JPGD_MAX_HUFF_TABLES = 8,
JPGD_MAX_QUANT_TABLES = 4,
JPGD_MAX_COMPONENTS = 4,
JPGD_MAX_COMPS_IN_SCAN = 4,
JPGD_MAX_BLOCKS_PER_ROW = 16384,
JPGD_MAX_HEIGHT = 32768,
JPGD_MAX_WIDTH = 32768
JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
};
typedef int16 jpgd_quant_t;
@ -191,7 +142,7 @@ namespace jpgd
int begin_decoding();
// Returns the next scan line.
// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
// Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
// Returns JPGD_SUCCESS if a scan line has been returned.
// Returns JPGD_DONE if all scan lines have been returned.
@ -213,17 +164,17 @@ namespace jpgd
private:
jpeg_decoder(const jpeg_decoder&);
jpeg_decoder& operator=(const jpeg_decoder&);
jpeg_decoder& operator =(const jpeg_decoder&);
typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int);
struct huff_tables
{
bool ac_table;
uint look_up[256];
uint look_up2[256];
uint look_up[256];
uint look_up2[256];
uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
uint tree[JPGD_HUFF_TREE_MAX_LENGTH];
uint tree[JPGD_HUFF_TREE_MAX_LENGTH];
};
struct coeff_buf
@ -263,26 +214,26 @@ namespace jpgd
int m_comp_ident[JPGD_MAX_COMPONENTS]; // component's ID
int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
int m_comps_in_scan; // # of components in scan
int m_comp_list[JPGD_MAX_COMPS_IN_SCAN]; // components in this scan
int m_comp_dc_tab[JPGD_MAX_COMPONENTS]; // component's DC Huffman coding table selector
int m_comp_ac_tab[JPGD_MAX_COMPONENTS]; // component's AC Huffman coding table selector
int m_spectral_start; // spectral selection start
int m_spectral_end; // spectral selection end
int m_successive_low; // successive approximation low
int m_successive_high; // successive approximation high
int m_max_mcu_x_size; // MCU's max. X size in pixels
int m_max_mcu_y_size; // MCU's max. Y size in pixels
int m_comps_in_scan; // # of components in scan
int m_comp_list[JPGD_MAX_COMPS_IN_SCAN]; // components in this scan
int m_comp_dc_tab[JPGD_MAX_COMPONENTS]; // component's DC Huffman coding table selector
int m_comp_ac_tab[JPGD_MAX_COMPONENTS]; // component's AC Huffman coding table selector
int m_spectral_start; // spectral selection start
int m_spectral_end; // spectral selection end
int m_successive_low; // successive approximation low
int m_successive_high; // successive approximation high
int m_max_mcu_x_size; // MCU's max. X size in pixels
int m_max_mcu_y_size; // MCU's max. Y size in pixels
int m_blocks_per_mcu;
int m_max_blocks_per_row;
int m_mcus_per_row, m_mcus_per_col;
int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
int m_total_lines_left; // total # lines left in image
int m_mcu_lines_left; // total # lines left in this MCU
int m_total_lines_left; // total # lines left in image
int m_mcu_lines_left; // total # lines left in this MCU
int m_num_buffered_scanlines;
int m_real_dest_bytes_per_scan_line;
int m_dest_bytes_per_scan_line; // rounded up
int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y)
int m_dest_bytes_per_scan_line; // rounded up
int m_dest_bytes_per_pixel; // 4 (RGB) or 1 (Y)
huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
@ -324,12 +275,7 @@ namespace jpgd
bool m_sample_buf_prev_valid;
bool m_has_sse2;
inline int check_sample_buf_ofs(int ofs) const
{
assert(ofs >= 0);
assert(ofs < m_max_blocks_per_row * 64);
return ofs;
}
inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; }
void free_all_blocks();
JPGD_NORETURN void stop_decoding(jpgd_status status);
void* alloc(size_t n, bool zero = false);

View File

@ -24,26 +24,26 @@
#include <immintrin.h>
#ifdef _MSC_VER
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name
#else
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif
// Accuracy bits and the shift counts derived from them.
// The replacement lists are parenthesized so that each macro expands to a
// single value wherever it is used (e.g. `2 * SHIFT_INV_ROW` or
// `1 << (SHIFT_INV_ROW - 1)`); without the parentheses the surrounding
// operators would bind to only part of the expression.
#define BITS_INV_ACC 4
#define SHIFT_INV_ROW (16 - BITS_INV_ACC)
#define SHIFT_INV_COL (1 + BITS_INV_ACC)
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC); //1 << (SHIFT_INV_ROW-1)
const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3); // 1 << (SHIFT_INV_COL-1)
const short IRND_INV_CORR = IRND_INV_COL - 1; // correction -1.0 and round
JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8]) = {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; // cos * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8])= {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tg * (2<<16) + 0.5
JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// cos * (2<<16) + 0.5
//-----------------------------------------------------------------------------
// Table for rows 0,4 - constants are multiplied on cos_4_16
@ -56,22 +56,22 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
16384, -8867, 16384, -21407, // w13 w12 w09 w08
16384, 8867, -16384, -21407, // w07 w06 w03 w02
-16384, 21407, 16384, -8867, // w15 w14 w11 w10
22725, 19266, 19266, -4520, // w21 w20 w17 w16
22725, 19266, 19266, -4520, // w21 w20 w17 w16
12873, -22725, 4520, -12873, // w29 w28 w25 w24
12873, 4520, -22725, -12873, // w23 w22 w19 w18
4520, 19266, 19266, -22725}; // w31 w30 w27 w26
// Table for rows 1,7 - constants are multiplied on cos_1_16
// Table for rows 1,7 - constants are multiplied on cos_1_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
22725, 29692, 22725, 12299,
22725, -12299, 22725, -29692, // w13 w12 w09 w08
22725, 12299, -22725, -29692, // w07 w06 w03 w02
-22725, 29692, 22725, -12299, // w15 w14 w11 w10
31521, 26722, 26722, -6270, // w21 w20 w17 w16
17855, -31521, 6270, -17855, // w29 w28 w25 w24
17855, 6270, -31521, -17855, // w23 w22 w19 w18
6270, 26722, 26722, -31521}; // w31 w30 w27 w26
31521, 26722, 26722, -6270, // w21 w20 w17 w16
17855, -31521, 6270, -17855, // w29 w28 w25 w24
17855, 6270, -31521, -17855, // w23 w22 w19 w18
6270, 26722, 26722, -31521}; // w31 w30 w27 w26
// Table for rows 2,6 - constants are multiplied on cos_2_16
//movq -> w05 w04 w01 w00
@ -80,10 +80,10 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
21407, -11585, 21407, -27969, // w13 w12 w09 w08
21407, 11585, -21407, -27969, // w07 w06 w03 w02
-21407, 27969, 21407, -11585, // w15 w14 w11 w10
29692, 25172, 25172, -5906, // w21 w20 w17 w16
16819, -29692, 5906, -16819, // w29 w28 w25 w24
16819, 5906, -29692, -16819, // w23 w22 w19 w18
5906, 25172, 25172, -29692}; // w31 w30 w27 w26
29692, 25172, 25172, -5906, // w21 w20 w17 w16
16819, -29692, 5906, -16819, // w29 w28 w25 w24
16819, 5906, -29692, -16819, // w23 w22 w19 w18
5906, 25172, 25172, -29692}; // w31 w30 w27 w26
// Table for rows 3,5 - constants are multiplied on cos_3_16
//movq -> w05 w04 w01 w00
JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
@ -91,28 +91,28 @@ JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
19266, -10426, 19266, -25172, // w13 w12 w09 w08
19266, 10426, -19266, -25172, // w07 w06 w03 w02
-19266, 25172, 19266, -10426, // w15 w14 w11 w10
26722, 22654, 22654, -5315, // w21 w20 w17 w16
15137, -26722, 5315, -15137, // w29 w28 w25 w24
15137, 5315, -26722, -15137, // w23 w22 w19 w18
5315, 22654, 22654, -26722}; // w31 w30 w27 w26
26722, 22654, 22654, -5315, // w21 w20 w17 w16
15137, -26722, 5315, -15137, // w29 w28 w25 w24
15137, 5315, -26722, -15137, // w23 w22 w19 w18
5315, 22654, 22654, -26722}; // w31 w30 w27 w26
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = {128, 128, 128, 128, 128, 128, 128, 128};
JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 };
void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
{
__m128i r_xmm0, r_xmm4;
__m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
short* pTab_i_04 = shortM128_tab_i_04;
short* pTab_i_26 = shortM128_tab_i_26;
short * pTab_i_04 = shortM128_tab_i_04;
short * pTab_i_26 = shortM128_tab_i_26;
//Get pointers for this input and output
pTab_i_04 = shortM128_tab_i_04;
pTab_i_26 = shortM128_tab_i_26;
//Row 1 and Row 3
r_xmm0 = _mm_load_si128((__m128i*)pInput);
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[2 * 8]));
r_xmm0 = _mm_load_si128((__m128i *) pInput);
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));
// *** Work on the data in xmm0
//low shuffle mask = 0xd8 = 11 01 10 00
@ -121,58 +121,58 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
// copy short 2 and short 0 to all locations
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
// add to those copies
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
// shuffle mask = 0x55 = 01 01 01 01
// copy short 3 and short 1 to all locations
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
// high shuffle mask = 0xd8 = 11 01 10 00
// get short 6 and short 4 into bit positions 64-95
// get short 7 and short 5 into bit positions 96-127
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
// add to short 3 and short 1
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
// shuffle mask = 0xaa = 10 10 10 10
// copy short 6 and short 4 to all locations
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
// shuffle mask = 0xff = 11 11 11 11
// copy short 7 and short 5 to all locations
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
// add to short 6 and short 4
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
// *** Work on the data in xmm4
// high shuffle mask = 0xd8 11 01 10 00
// get short 6 and short 4 into bit positions 64-95
// get short 7 and short 5 into bit positions 96-127
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
// (xmm0 short 2 and short 0 plus pSi) + some constants
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
@ -187,37 +187,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
//Row 5 and row 7
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[4 * 8]));
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[6 * 8]));
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&shortM128_tab_i_26[0]));
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&shortM128_tab_i_26[8]));
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&shortM128_tab_i_26[16]));
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
@ -234,37 +234,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
//Row 4 and row 2
pTab_i_04 = shortM128_tab_i_35;
pTab_i_26 = shortM128_tab_i_17;
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[3 * 8]));
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[1 * 8]));
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
@ -279,37 +279,37 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
//Row 6 and row 8
r_xmm0 = _mm_load_si128((__m128i*)(&pInput[5 * 8]));
r_xmm4 = _mm_load_si128((__m128i*)(&pInput[7 * 8]));
r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));
r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i*)pTab_i_04));
r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i*)&pTab_i_04[16]));
r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i*)&pTab_i_04[8]));
r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8]));
r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i*)shortM128_round_inv_row));
r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i*)&pTab_i_04[24]));
r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i*)&pTab_i_26[0]));
r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
r_xmm2 = r_xmm1;
r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i*)&pTab_i_26[8]));
r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i*)&pTab_i_26[16]));
r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16]));
r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i*)shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i*)&pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
r_xmm6 = r_xmm5;
r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
@ -323,13 +323,13 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
r_xmm1 = _mm_load_si128((__m128i*)shortM128_tg_3_16);
r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
r_xmm2 = row5;
r_xmm3 = row3;
r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
r_xmm5 = _mm_load_si128((__m128i*)shortM128_tg_1_16);
r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
r_xmm6 = row7;
r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
@ -339,7 +339,7 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm7 = row6;
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
r_xmm3 = _mm_load_si128((__m128i*)shortM128_tg_2_16);
r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
r_xmm1 = r_xmm0;
@ -347,11 +347,11 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i*)shortM128_one_corr));
r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
r_xmm6 = r_xmm5;
r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_one_corr));
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
//Intermediate results, needed later
@ -359,9 +359,9 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
temp7 = r_xmm0;
r_xmm1 = r_xmm4;
r_xmm0 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
r_xmm2 = _mm_load_si128((__m128i*)shortM128_cos_4_16);
r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
//Intermediate results, needed later
@ -377,24 +377,24 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i*)shortM128_one_corr));
r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i*)shortM128_one_corr));
r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));
r_xmm2 = r_xmm5;
r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
r_xmm1 = r_xmm6;
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i*)shortM128_round_inv_col));
r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
r_xmm7 = temp7;
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i*)shortM128_round_inv_col));
r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i*)shortM128_round_inv_corr));
r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
r_xmm3 = r_xmm6;
r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i*)shortM128_round_inv_corr));
r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
//Store results for row 0
@ -406,7 +406,7 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
//Store results for row 1
//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6);
__m128i r1 = r_xmm6;
r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
@ -415,24 +415,24 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
//Store results for row 2
//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1);
__m128i r2 = r_xmm1;
r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
r_xmm5 = _mm_subs_epi16(r_xmm5, temp7);
r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
//Store results for row 7
//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5);
__m128i r7 = r_xmm5;
r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
r_xmm2 = _mm_subs_epi16(r_xmm2, temp3);
r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
//Store results for row 3
//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6);
__m128i r3 = r_xmm6;
r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
@ -446,17 +446,17 @@ void idctSSEShortU8(const short* pInput, uint8_t* pOutputUB)
__m128i r5 = r_xmm7;
__m128i r6 = r_xmm3;
r0 = _mm_add_epi16(*(const __m128i*)shortM128_128, r0);
r1 = _mm_add_epi16(*(const __m128i*)shortM128_128, r1);
r2 = _mm_add_epi16(*(const __m128i*)shortM128_128, r2);
r3 = _mm_add_epi16(*(const __m128i*)shortM128_128, r3);
r4 = _mm_add_epi16(*(const __m128i*)shortM128_128, r4);
r5 = _mm_add_epi16(*(const __m128i*)shortM128_128, r5);
r6 = _mm_add_epi16(*(const __m128i*)shortM128_128, r6);
r7 = _mm_add_epi16(*(const __m128i*)shortM128_128, r7);
r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);
((__m128i*)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
((__m128i*)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
((__m128i*)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
((__m128i*)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
}