diff --git a/libretro-common/formats/jpeg/rjpeg.c b/libretro-common/formats/jpeg/rjpeg.c
index 81ca237750..44be7d75ad 100644
--- a/libretro-common/formats/jpeg/rjpeg.c
+++ b/libretro-common/formats/jpeg/rjpeg.c
@@ -1,906 +1,2393 @@
-#include
 #include
+#include
+#include <stddef.h> /* ptrdiff_t on osx */
 #include
 #include
-#include
+#include
 #include
+#include
 #include

-#define RJPEG_DECODE_SOF 0xC0
-#define RJPEG_DECODE_DHT 0xC4
-#define RJPEG_DECODE_DQT 0xDB
-#define RJPEG_DECODE_DRI 0xDD
-#define RJPEG_DECODE_SCAN 0xDA
-#define RJPEG_DECODE_SKIP_MARKER 0xFE
-
-#define CF(x) rjpeg_clip(((x) + 64) >> 7)
-#define JPEG_DECODER_THROW(ctx, e) do { ctx->error = e; return; } while (0)
-
-enum rjpeg_decode_result
-{
-   RJPEG_OK = 0,
-   RJPEG_NOT_A_FILE,
-   RJPEG_UNSUPPORTED,
-   RJPEG_OOM,
-   RJPEG_INTERNAL_ERROR,
-   RJPEG_SYNTAX_ERROR,
-   RJPEG_INTERNAL_FINISHED
-};
-
 enum
 {
-   CF4A = (-9),
-   CF4B = (111),
-   CF4C = (29),
-   CF4D = (-3),
-   CF3A = (28),
-   CF3B = (109),
-   CF3C = (-9),
-   CF3X = (104),
-   CF3Y = (27),
-   CF3Z = (-3),
-   CF2A = (139),
-   CF2B = (-11)
+   STBI_default = 0, /* only used for req_comp */
+   STBI_grey = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb = 3,
+   STBI_rgb_alpha = 4
 };

-enum
+typedef struct
 {
-   W1 = 2841,
-   W2 = 2676,
-   W3 = 2408,
-   W5 = 1609,
-   W6 = 1108,
-   W7 = 565
-};
+   int (*read) (void *user,char *data,int size); /* fill 'data' with 'size' bytes. return number of bytes actually read */
+   void (*skip) (void *user,int n);              /* skip the next 'n' bytes, or 'unget' the last -n bytes if negative */
+   int (*eof) (void *user);                      /* returns nonzero if we are at end of file/data */
+} stbi_io_callbacks;

-struct rjpeg_vlc_code
-{
-   uint8_t bits;
-   uint8_t code;
-};
+/* should produce compiler error if size is wrong */
+typedef unsigned char validate_uint32[sizeof(uint32_t)==4 ? 1 : -1];

-struct rjpeg_component
-{
-   int cid;
-   int ssx, ssy;
-   int width, height;
-   int stride;
-   int qtsel;
-   int actabsel;
-   int dctabsel;
-   int dcpred;
-   uint8_t *pixels;
-};
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v) (void)(v)
+#else
+#define STBI_NOTUSED(v) (void)sizeof(v)
+#endif

-struct rjpeg_data
-{
-   enum rjpeg_decode_result error;
-   const uint8_t *pos;
-   int size;
-   int length;
-   int width, height;
-   int mbwidth;
-   int mbheight;
-   int mbsizex;
-   int mbsizey;
-   int ncomp;
-   struct rjpeg_component comp[3];
-   int qtused;
-   int qtavail;
-   uint8_t qtab[4][64];
-   struct rjpeg_vlc_code vlctab[4][65536];
-   int buf, bufbits;
-   int block[64];
-   int rstinterval;
-   uint8_t *rgb;
-   char ZZ[64];
-};
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL
+#endif

-static INLINE uint8_t rjpeg_clip(const int x)
+#ifdef STBI_HAS_LROTL
+ #define stbi_lrot(x,y) _lrotl(x,y)
+#else
+ #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+/* NOTE: not clear whether we actually need this for the 64-bit path?
+ * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+ * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
+ * this is just broken and gcc are jerks for not fixing it properly
+ * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+ */
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+/* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+ *
+ * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+ * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+ * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+ * simultaneously enabling "-mstackrealign".
+ *
+ * See https://github.com/nothings/stb/issues/81 for more information.
+ *
+ * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+ * -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+ */
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET)
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400 /* not VC6 */
+#include <intrin.h> /* __cpuid */
+static int stbi__cpuid3(void)
 {
-   if (x < 0)
-      return 0;
-   return ((x > 0xFF) ? 0xFF : (unsigned char) x);
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
 }
-
-static void rjpeg_skip(struct rjpeg_data *ctx, int count)
+#else
+static int stbi__cpuid3(void)
 {
-   ctx->pos += count;
-   ctx->size -= count;
-   ctx->length -= count;
-   if (ctx->size < 0)
-      ctx->error = RJPEG_SYNTAX_ERROR;
-}
-
-static INLINE uint16_t rjpeg_decode_16(const uint8_t *pos)
-{
-   return (pos[0] << 8) | pos[1];
-}
-
-static INLINE void rjpeg_decode_length(struct rjpeg_data *ctx)
-{
-   if (ctx->size < 2)
-      JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-   ctx->length = rjpeg_decode_16(ctx->pos);
-   if (ctx->length > ctx->size)
-      JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-   rjpeg_skip(ctx, 2);
-}
-
-static void rjpeg_decode_dqt(struct rjpeg_data *ctx)
-{
-   unsigned char *t = NULL;
-
-   rjpeg_decode_length(ctx);
-
-   while (ctx->length >= 65)
-   {
-      int i = ctx->pos[0];
-      if (i & 0xFC)
-         JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-      ctx->qtavail |= 1 << i;
-      t = &ctx->qtab[i][0];
-      for (i = 0; i < 64; ++i)
-         t[i] = ctx->pos[i + 1];
-      rjpeg_skip(ctx, 65);
+   int res;
+   __asm {
+      mov eax,1
+      cpuid
+      mov res,edx
    }
-
-   if (ctx->length)
-      JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-}
-
-static void rjpeg_decode_dri(struct rjpeg_data *ctx)
-{
-   rjpeg_decode_length(ctx);
-   if (ctx->length < 2)
-      JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-   ctx->rstinterval = rjpeg_decode_16(ctx->pos);
-   rjpeg_skip(ctx, ctx->length);
-}
-
-static void rjpeg_decode_dht(struct rjpeg_data *ctx)
-{
-   unsigned char counts[16];
-   struct rjpeg_vlc_code *vlc = NULL;
-
-   rjpeg_decode_length(ctx);
-
-   while (ctx->length >= 17)
-   {
-      int codelen;
-      int spread = 65536;
-      int remain = 65536;
-      int i = ctx->pos[0];
-
-      if (i & 0xEC)
-         JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR);
-
-      if (i & 0x02)
-         JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED);
-
-      i = (i | (i >> 3)) & 3; /* combined DC/AC + tableid value */
-      for (codelen = 1; codelen <= 16; ++codelen)
-         counts[codelen - 1] = ctx->pos[codelen];
-      rjpeg_skip(ctx, 17);
-      vlc = &ctx->vlctab[i][0];
-
-      for (codelen = 1; codelen <= 16; ++codelen)
-      {
-         int currcnt;
-
-         spread >>= 1;
-         currcnt = counts[codelen - 1];
-         if (!currcnt)
-            continue;
-
-         if (ctx->length < currcnt)
-            JPEG_DECODER_THROW(ctx,
RJPEG_SYNTAX_ERROR); - remain -= currcnt << (16 - codelen); - - if (remain < 0) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - - for (i = 0; i < currcnt; ++i) - { - int j; - unsigned char code = ctx->pos[i]; - - for (j = spread; j; --j) - { - vlc->bits = (unsigned char) codelen; - vlc->code = code; - ++vlc; - } - } - rjpeg_skip(ctx, currcnt); - } - - while (remain--) - { - vlc->bits = 0; - ++vlc; - } - } - - if (ctx->length) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); -} - -static int rjpeg_show_bits(struct rjpeg_data *ctx, int bits) -{ - unsigned char newbyte; - if (!bits) - return 0; - - while (ctx->bufbits < bits) - { - if (ctx->size <= 0) - { - ctx->buf = (ctx->buf << 8) | 0xFF; - ctx->bufbits += 8; - continue; - } - - newbyte = *ctx->pos++; - ctx->size--; - ctx->bufbits += 8; - ctx->buf = (ctx->buf << 8) | newbyte; - - if (newbyte == 0xFF) - { - if (ctx->size) - { - unsigned char marker = *ctx->pos++; - ctx->size--; - switch (marker) - { - case 0: - break; - case 0xD9: - ctx->size = 0; - break; - default: - if ((marker & 0xF8) != 0xD0) - ctx->error = RJPEG_SYNTAX_ERROR; - else - { - ctx->buf = (ctx->buf << 8) | marker; - ctx->bufbits += 8; - } - } - } else - ctx->error = RJPEG_SYNTAX_ERROR; - } - } - return (ctx->buf >> (ctx->bufbits - bits)) & ((1 << bits) - 1); -} - -static void rjpeg_skip_bits(struct rjpeg_data *ctx, int bits) -{ - if (ctx->bufbits < bits) - rjpeg_show_bits(ctx, bits); - ctx->bufbits -= bits; -} - -static int rjpeg_get_bits(struct rjpeg_data *ctx, int bits) -{ - int res = rjpeg_show_bits(ctx, bits); - rjpeg_skip_bits(ctx, bits); return res; } +#endif -static int rjpeg_get_vlc(struct rjpeg_data *ctx, - struct rjpeg_vlc_code *vlc, unsigned char* code) +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name + +static int stbi__sse2_available() { - int value = rjpeg_show_bits(ctx, 16); - int bits = vlc[value].bits; + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; +} +#else /* assume GCC-style if not VC++ */ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) - if (!bits) - { - ctx->error = RJPEG_SYNTAX_ERROR; - return 0; - } +static int stbi__sse2_available() +{ +#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 /* GCC 4.8 or later */ + /* GCC 4.8+ has a nice way to do this */ + return __builtin_cpu_supports("sse2"); +#else + /* portable way to do this, preferably without using GCC inline ASM? + * just bail for now. 
*/ + return 0; +#endif +} +#endif +#endif - rjpeg_skip_bits(ctx, bits); - value = vlc[value].code; - if (code) - *code = (unsigned char) value; - bits = value & 15; - if (!bits) - return 0; - value = rjpeg_get_bits(ctx, bits); - if (value < (1 << (bits - 1))) - value += ((-1) << bits) + 1; - return value; +/* ARM NEON */ +#if defined(STBI_NO_SIMD) && defined(STBI_NEON) +#undef STBI_NEON +#endif + +#ifdef STBI_NEON +#include +/* assume GCC or Clang on ARM targets */ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) +#endif + +#ifndef STBI_SIMD_ALIGN +#define STBI_SIMD_ALIGN(type, name) type name +#endif + +/////////////////////////////////////////////// +// +// stbi__context struct and start_xxx functions + +// stbi__context structure is our basic context used by all images, so it +// contains all the IO context, plus some basic image information +typedef struct +{ + uint32_t img_x, img_y; + int img_n, img_out_n; + + stbi_io_callbacks io; + void *io_user_data; + + int read_from_callbacks; + int buflen; + uint8_t buffer_start[128]; + + uint8_t *img_buffer, *img_buffer_end; + uint8_t *img_buffer_original; +} stbi__context; + + +static void stbi__refill_buffer(stbi__context *s); + +// initialize a memory-decode context +static void stbi__start_mem(stbi__context *s, const uint8_t *buffer, int len) +{ + s->io.read = NULL; + s->read_from_callbacks = 0; + s->img_buffer = s->img_buffer_original = (uint8_t *) buffer; + s->img_buffer_end = (uint8_t *) buffer+len; } -static void rjpeg_row_idct(int* blk) +static void stbi__rewind(stbi__context *s) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - if (!((x1 = blk[4] << 11) - | (x2 = blk[6]) - | (x3 = blk[2]) - | (x4 = blk[1]) - | (x5 = blk[7]) - | (x6 = blk[5]) - | (x7 = blk[3]))) - { - unsigned i; - int val = blk[0] << 3; - - for (i = 0; i < 8; i++) - blk[i] = val; - return; - } - - x0 = (blk[0] << 11) + 128; - x8 = W7 * (x4 + x5); - x4 = x8 + (W1 - W7) * x4; - x5 = x8 - (W1 + W7) * x5; - x8 = W3 * (x6 + x7); - x6 = x8 - (W3 - W5) * x6; - x7 = x8 - (W3 + W5) * x7; - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2); - x2 = x1 - (W2 + W6) * x2; - x3 = x1 + (W2 - W6) * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - blk[0] = (x7 + x1) >> 8; - blk[1] = (x3 + x2) >> 8; - blk[2] = (x0 + x4) >> 8; - blk[3] = (x8 + x6) >> 8; - blk[4] = (x8 - x6) >> 8; - blk[5] = (x0 - x4) >> 8; - blk[6] = (x3 - x2) >> 8; - blk[7] = (x7 - x1) >> 8; + /* conceptually rewind SHOULD rewind to the beginning of the stream, + * but we just rewind to the beginning of the initial buffer, because + * we only use it after doing 'test', which only ever looks at at most 92 bytes + */ + s->img_buffer = s->img_buffer_original; } -static void rjpeg_col_idct(const int* blk, unsigned char *out, int stride) +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static uint8_t *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp); +#endif + +// this is not threadsafe +static const char *stbi__g_failure_reason; + +static int stbi__err(const char *str) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - if (!((x1 = blk[8*4] << 8) - | (x2 = blk[8*6]) - | (x3 = blk[8*2]) - | (x4 = blk[8*1]) - | (x5 = blk[8*7]) - | (x6 = blk[8*5]) - | (x7 = blk[8*3]))) - { - x1 = rjpeg_clip(((blk[0] + 32) >> 6) + 128); - for (x0 = 8; x0; --x0) - { - *out = (unsigned char) x1; - out += stride; + stbi__g_failure_reason = str; + 
return 0; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *) (stbi__err(x,y)?NULL:NULL)) + +static int stbi__vertically_flip_on_load = 0; + +static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result = stbi__load_main(s, x, y, comp, req_comp); + + if (stbi__vertically_flip_on_load && result != NULL) { + int w = *x, h = *y; + int depth = req_comp ? req_comp : *comp; + int row,col,z; + uint8_t temp; + + // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once + for (row = 0; row < (h>>1); row++) { + for (col = 0; col < w; col++) { + for (z = 0; z < depth; z++) { + temp = result[(row * w + col) * depth + z]; + result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z]; + result[((h - row - 1) * w + col) * depth + z] = temp; + } + } } + } + + return result; +} + +static uint8_t *stbi_load_from_memory(const uint8_t *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_flip(&s,x,y,comp,req_comp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 
0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +static INLINE uint8_t stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +static INLINE int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} + +static void stbi__skip(stbi__context *s, int n) +{ + if (n < 0) { + s->img_buffer = s->img_buffer_end; return; } - x0 = (blk[0] << 8) + 8192; - x8 = W7 * (x4 + x5) + 4; - x4 = (x8 + (W1 - W7) * x4) >> 3; - x5 = (x8 - (W1 + W7) * x5) >> 3; - x8 = W3 * (x6 + x7) + 4; - x6 = (x8 - (W3 - W5) * x6) >> 3; - x7 = (x8 - (W3 + W5) * x7) >> 3; - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2) + 4; - x2 = (x1 - (W2 + W6) * x2) >> 3; - x3 = (x1 + (W2 - W6) * x3) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - *out = rjpeg_clip(((x7 + x1) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x3 + x2) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x0 + x4) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x8 + x6) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x8 - x6) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x0 - x4) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x3 - x2) >> 14) + 128); - out += stride; - *out = rjpeg_clip(((x7 - x1) >> 14) + 128); + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; } -static INLINE void rjpeg_decode_block( - struct rjpeg_data *ctx, - struct rjpeg_component *c, - unsigned char* out) +static int stbi__get16be(stbi__context *s) { - unsigned char code = 0; - int coef = 0; + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} - memset(ctx->block, 0, sizeof(ctx->block)); +#define STBI__BYTECAST(x) ((uint8_t) ((x) & 255)) // truncate int to byte without warnings - c->dcpred += rjpeg_get_vlc(ctx, &ctx->vlctab[c->dctabsel][0], NULL); - ctx->block[0] = (c->dcpred) * ctx->qtab[c->qtsel][0]; +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with 
SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly - do +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + uint8_t fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + uint16_t code[256]; + uint8_t values[256]; + uint8_t size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + uint8_t dequant[4][64]; + int16_t fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct { - int value = rjpeg_get_vlc(ctx, &ctx->vlctab[c->actabsel][0], &code); + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; - if (!code) - break; /* EOB */ + int x,y,w2,h2; + uint8_t *data; + void *raw_data, *raw_coeff; + uint8_t *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; - if (!(code & 0x0F) && (code != 0xF0)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - coef += (code >> 4) + 1; - if (coef > 63) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - ctx->block[(int) ctx->ZZ[coef]] = value * ctx->qtab[c->qtsel][coef]; - } while (coef < 63); + uint32_t code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop - for (coef = 0; coef < 64; coef += 8) - rjpeg_row_idct(&ctx->block[coef]); + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; - for (coef = 0; coef < 8; ++coef) - rjpeg_col_idct(&ctx->block[coef], &out[coef], c->stride); -} + int scan_n, order[4]; + int restart_interval, todo; +// kernels + void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step); + uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs); +} stbi__jpeg; -static INLINE void rjpeg_byte_align(struct rjpeg_data *ctx) +static int stbi__build_huffman(stbi__huffman *h, int *count) { - ctx->bufbits &= 0xF8; + int i,j,k=0,code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) + for (j=0; j < count[i]; ++j) + h->size[k++] = (uint8_t) (i+1); + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (uint16_t) (code++); + if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (uint8_t) i; + } + } + } + return 1; } -static INLINE void 
rjpeg_skip_marker(struct rjpeg_data *ctx) -{ - rjpeg_decode_length(ctx); - rjpeg_skip(ctx, ctx->length); -} - -static void rjpeg_decode_sof(struct rjpeg_data *ctx) +// build a table that decodes both magnitude and value of small ACs in +// one go. +static void stbi__build_fast_ac(int16_t *fast_ac, stbi__huffman *h) { int i; - int ssxmax = 0; - int ssymax = 0; - struct rjpeg_component *c = NULL; + for (i=0; i < (1 << FAST_BITS); ++i) { + uint8_t fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; - rjpeg_decode_length(ctx); - - if (ctx->length < 9) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - if (ctx->pos[0] != 8) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); - ctx->height = rjpeg_decode_16(ctx->pos+1); - ctx->width = rjpeg_decode_16(ctx->pos+3); - ctx->ncomp = ctx->pos[5]; - rjpeg_skip(ctx, 6); - - switch (ctx->ncomp) - { - case 1: - case 3: - break; - default: - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (-1 << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits)); + } + } } - - if (ctx->length < (ctx->ncomp * 3)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - - for (i = 0, c = ctx->comp; i < ctx->ncomp; ++i, ++c) - { - c->cid = ctx->pos[0]; - if (!(c->ssx = ctx->pos[1] >> 4)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - if (c->ssx & (c->ssx - 1)) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); /* non-power of two */ - if (!(c->ssy = ctx->pos[1] & 15)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - if (c->ssy & (c->ssy - 1)) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); /* non-power of two */ - if ((c->qtsel = ctx->pos[2]) & 0xFC) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - rjpeg_skip(ctx, 3); - ctx->qtused |= 1 << c->qtsel; - if (c->ssx > ssxmax) ssxmax = c->ssx; - if (c->ssy > ssymax) ssymax = c->ssy; - } - ctx->mbsizex = ssxmax << 3; - ctx->mbsizey = ssymax << 3; - ctx->mbwidth = (ctx->width + ctx->mbsizex - 1) / ctx->mbsizex; - ctx->mbheight = (ctx->height + ctx->mbsizey - 1) / ctx->mbsizey; - - for (i = 0, c = ctx->comp; i < ctx->ncomp; ++i, ++c) - { - c->width = (ctx->width * c->ssx + ssxmax - 1) / ssxmax; - c->stride = (c->width + 7) & 0x7FFFFFF8; - c->height = (ctx->height * c->ssy + ssymax - 1) / ssymax; - c->stride = ctx->mbwidth * ctx->mbsizex * c->ssx / ssxmax; - if (((c->width < 3) && (c->ssx != ssxmax)) || ((c->height < 3) && (c->ssy != ssymax))) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); - if (!(c->pixels = (unsigned char*)malloc(c->stride * (ctx->mbheight * ctx->mbsizey * c->ssy / ssymax)))) - JPEG_DECODER_THROW(ctx, RJPEG_OOM); - } - - if (ctx->ncomp == 3) - { - ctx->rgb = (unsigned char*)malloc(ctx->width * ctx->height * ctx->ncomp); - if (!ctx->rgb) - JPEG_DECODER_THROW(ctx, RJPEG_OOM); - } - rjpeg_skip(ctx, ctx->length); } -static void rjpeg_decode_scan(struct rjpeg_data *ctx) +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) { - int i, mbx, mby, sbx, sby; - int rstcount = ctx->rstinterval; - int nextrst = 0; - struct rjpeg_component *c = NULL; + do { + int b = j->nomore ? 
0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} - rjpeg_decode_length(ctx); +// (1 << n) - 1 +static uint32_t stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; - if (ctx->length < (4 + 2 * ctx->ncomp)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - if (ctx->pos[0] != ctx->ncomp) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); - rjpeg_skip(ctx, 1); - for (i = 0, c = ctx->comp; i < ctx->ncomp; ++i, ++c) - { - if (ctx->pos[0] != c->cid) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - if (ctx->pos[1] & 0xEE) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - c->dctabsel = ctx->pos[1] >> 4; - c->actabsel = (ctx->pos[1] & 1) | 2; - rjpeg_skip(ctx, 2); +// decode a jpeg huffman value from the bitstream +static INLINE int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; } - if (ctx->pos[0] || (ctx->pos[1] != 63) || ctx->pos[2]) - JPEG_DECODER_THROW(ctx, RJPEG_UNSUPPORTED); + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } - rjpeg_skip(ctx, ctx->length); + if (k > j->code_bits) + return -1; - for (mby = 0; mby < ctx->mbheight; ++mby) + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + assert((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; +} + +// bias[n] = (-1<code_bits < n) stbi__grow_buffer_unsafe(j); + + sgn = (int32_t)j->code_buffer >> 31; // sign bit is always in MSB + k = stbi_lrot(j->code_buffer, n); + assert(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k + (stbi__jbias[n] & ~sgn); +} + +// get some unsigned bits +static INLINE int stbi__jpeg_get_bits(stbi__jpeg *j, int n) +{ + unsigned int k; + if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~stbi__bmask[n]; + k &= stbi__bmask[n]; + j->code_bits -= n; + return k; +} + +static INLINE int stbi__jpeg_get_bit(stbi__jpeg *j) +{ + unsigned int k; + if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); + k = j->code_buffer; + j->code_buffer <<= 1; + --j->code_bits; + return k & 0x80000000; +} + +// given a value that's at position X in the zigzag stream, +// where does it appear in the 8x8 matrix coded as row-major? 
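+// (illustrative note: e.g. zigzag position 2 maps to entry 8 in the table
+// below, i.e. row 1, column 0 of the row-major 8x8 block; the 15 extra
+// trailing 63s let corrupt streams that overrun the 64th coefficient
+// read harmlessly instead of indexing past the block)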
+static uint8_t stbi__jpeg_dezigzag[64+15] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63 +}; + +// decode one 64-entry block-- +static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, int16_t *fac, int b, uint8_t *dequant) +{ + int diff,dc,k; + int t; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data,0,64*sizeof(data[0])); + + diff = t ? stbi__extend_receive(j, t) : 0; + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + if (j->spec_end != 0) + return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) + stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { - for (mbx = 0; mbx < ctx->mbwidth; ++mbx) - { - for (i = 0, c = ctx->comp; i < ctx->ncomp; ++i, ++c) - { - for (sby = 0; sby < c->ssy; ++sby) - { - for (sbx = 0; sbx < c->ssx; ++sbx) - { - rjpeg_decode_block(ctx, c, - &c->pixels[((mby * c->ssy + sby) * c->stride + mbx * c->ssx + sbx) << 3]); - if (ctx->error) - return; + int diff,dc; + int t; + + /* first scan for DC coefficient, must be first */ + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short) (dc << j->succ_low); + } + else + { + /* refinement scan for DC coefficient */ + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, int16_t *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) << shift); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) << shift); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +static INLINE uint8_t stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { + if (x < 0) return 0; + if (x > 255) return 255; + } + return (uint8_t) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) << 12) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * 
stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(uint8_t *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + uint8_t *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] << 2; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +/* sse2 integer IDCT. not the fastest possible implementation but it + * produces bit-identical results to the generic C version so it's + * fully "transparent". + */ +static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64]) +{ + /* This is constructed to match our regular (generic) integer IDCT exactly. 
*/ + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + /* dot product constant: even elems=x, odd elems=y */ + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + /* out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + * out(1) = c1[even]*x + c1[odd]*y + */ + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + /* out = in << 12 (in 16-bit, out 32-bit) */ + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + /* wide add */ + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + /* wide sub */ + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + /* butterfly a/b, add bias, then shift by "s" and pack */ + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + /* 8-bit interleave step (for transposes) */ + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + /* 16-bit interleave step (for transposes) */ + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + 
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + /* rounding biases in column/row passes, see stbi__idct_block for explanation. */ + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + /* load */ + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + /* column pass */ + dct_pass(bias_0, 10); + + { + /* 16bit 8x8 transpose pass 1 */ + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + /* transpose pass 2 */ + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + /* transpose pass 3 */ + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + /* row pass */ + dct_pass(bias_1, 17); + + { + /* pack */ + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif /* STBI_SSE2 */ + +#ifdef STBI_NEON + +/* NEON integer IDCT. should produce bit-identical + * results to the generic C version. 
*/ +static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +/* wide add */ +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +/* wide sub */ +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); 
+ row5 = vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + /* pack and round */ + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + /* again, these can translate into one instruction, but often don't. */ +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + /* sadly can't use interleaved stores here since we only write + * 8 bytes to each scan line! */ + + /* 8x8 8-bit transpose pass 1 */ + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + /* pass 2 */ + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + /* pass 3 */ + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + /* store */ + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif /* STBI_NEON */ + +#define STBI__MARKER_none 0xff +/* if there's a pending marker from the entropy stream, return that + * otherwise, fetch from the stream and get a marker. 
if there's no + * marker, return 0xff, which is never a valid marker value + */ +static uint8_t stbi__get_marker(stbi__jpeg *j) +{ + uint8_t x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); + return x; +} + +/* in each scan, we'll have scan_n components, and the order + * of the components is specified by order[] + */ +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +/* after a restart interval, stbi__jpeg_reset the entropy decoder and + * the dc prediction + */ +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); } } } - - if (ctx->rstinterval && !(--rstcount)) - { - rjpeg_byte_align(ctx); - i = rjpeg_get_bits(ctx, 16); - if (((i & 0xFFF8) != 0xFFD0) || ((i & 7) != nextrst)) - JPEG_DECODER_THROW(ctx, RJPEG_SYNTAX_ERROR); - nextrst = (nextrst + 1) & 7; - rstcount = ctx->rstinterval; - - for (i = 0; i < 3; ++i) - ctx->comp[i].dcpred = 0; + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; } } - - ctx->error = RJPEG_INTERNAL_FINISHED; } -static void rjpeg_upsample_h(struct rjpeg_data *ctx, struct rjpeg_component *c) -{ - int x, y; - unsigned char *lin = NULL; - unsigned char *lout = NULL; - const int xmax = c->width - 3; - uint8_t *out = (uint8_t*)malloc((c->width * c->height) << 1); - if (!out) - JPEG_DECODER_THROW(ctx, RJPEG_OOM); - lin = c->pixels; - lout = out; - for (y = c->height; y; --y) - { - lout[0] = CF(CF2A * lin[0] + CF2B * lin[1]); - lout[1] = CF(CF3X * lin[0] + CF3Y * lin[1] + CF3Z * lin[2]); - lout[2] = CF(CF3A * lin[0] + CF3B * lin[1] + CF3C * lin[2]); - - for (x = 0; x < xmax; ++x) - { - lout[(x << 1) + 3] = CF(CF4A * lin[x] + CF4B * lin[x + 1] + CF4C * lin[x + 2] + CF4D * lin[x + 3]); - lout[(x << 1) + 4] = CF(CF4D * lin[x] + CF4C * lin[x + 1] + CF4B * lin[x + 2] + CF4A * lin[x + 3]); - } - - lin += c->stride; - lout += c->width << 1; - lout[-3] = CF(CF3A * lin[-1] + CF3B * lin[-2] + CF3C * lin[-3]); - lout[-2] = CF(CF3X * lin[-1] + CF3Y * lin[-2] + CF3Z * lin[-3]); - lout[-1] = CF(CF2A * lin[-1] + CF2B * lin[-2]); - } - c->width <<= 1; - c->stride = c->width; - free(c->pixels); - c->pixels = out; -} - -static void rjpeg_upsample_v(struct rjpeg_data *ctx, struct rjpeg_component *c) -{ - int x; - const int w = c->width, s1 = c->stride, s2 = s1 + s1; - unsigned char *out = (unsigned char*)malloc((c->width * c->height) << 1); - - for (x = 0; x < w; ++x) - { - int y; - unsigned char *cin = &c->pixels[x]; - unsigned char *cout = &out[x]; - - *cout = CF(CF2A * cin[0] + CF2B * cin[s1]); - cout += w; - - *cout = CF(CF3X * cin[0] + CF3Y * cin[s1] + CF3Z * cin[s2]); - cout += w; - - *cout = CF(CF3A * cin[0] + CF3B * cin[s1] + CF3C * cin[s2]); - cout += w; - - cin += s1; - for (y = c->height - 3; y; --y) - { - *cout = CF(CF4A * cin[-s1] + CF4B * cin[0] + CF4C * cin[s1] + CF4D * cin[s2]); - cout += w; - *cout = CF(CF4D * cin[-s1] + CF4C * cin[0] + CF4B * cin[s1] + CF4A * cin[s2]); - cout += w; - cin += s1; - } - cin += s1; - *cout = CF(CF3A * cin[0] + CF3B * cin[-s1] + CF3C * cin[-s2]); - cout += w; - *cout = CF(CF3X * cin[0] + CF3Y * cin[-s1] + CF3Z * cin[-s2]); - cout += w; - *cout = CF(CF2A * cin[0] + CF2B * cin[-s1]); - } - - c->height <<= 1; - c->stride = c->width; - - free(c->pixels); - c->pixels = out; -} - - -static void rjpeg_convert(struct rjpeg_data *ctx) +static void stbi__jpeg_dequantize(short *data, uint8_t *dequant) { int i; - struct rjpeg_component *c = NULL; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} - for (i = 0, c = ctx->comp; i < ctx->ncomp; ++i, ++c) - { - while ((c->width < ctx->width) || (c->height < ctx->height)) - { - if (c->width < ctx->width) - rjpeg_upsample_h(ctx, c); - - if (ctx->error) - return; - - if 
(c->height < ctx->height) - rjpeg_upsample_v(ctx, c); - - if (ctx->error) - return; - } - if ((c->width < ctx->width) || (c->height < ctx->height)) - JPEG_DECODER_THROW(ctx, RJPEG_INTERNAL_ERROR); - } - - if (ctx->ncomp == 3) - { - /* convert to RGB */ - int x, yy; - unsigned char *prgb = ctx->rgb; - const unsigned char *py = ctx->comp[0].pixels; - const unsigned char *pcb = ctx->comp[1].pixels; - const unsigned char *pcr = ctx->comp[2].pixels; - - for (yy = ctx->height; yy; --yy) - { - for (x = 0; x < ctx->width; ++x) - { - int y = py[x] << 8; - int cb = pcb[x] - 128; - int cr = pcr[x] - 128; - *prgb++ = rjpeg_clip((y + 359 * cr + 128) >> 8); - *prgb++ = rjpeg_clip((y - 88 * cb - 183 * cr + 128) >> 8); - *prgb++ = rjpeg_clip((y + 454 * cb + 128) >> 8); +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } } - py += ctx->comp[0].stride; - pcb += ctx->comp[1].stride; - pcr += ctx->comp[2].stride; } } - else if (ctx->comp[0].width != ctx->comp[0].stride) +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4; + int t = q & 15,i; + if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s); + L -= 65; + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + uint8_t *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + stbi__skip(z->s, stbi__get16be(z->s)-2); + return 1; + } + return 0; +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + + z->scan_n = stbi__get8(z->s); + + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) + return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) + return stbi__err("bad SOS len","Corrupt JPEG"); + + for (i=0; i < z->scan_n; ++i) { - /* grayscale -> only remove stride */ - int y; - 
unsigned char *pin = &ctx->comp[0].pixels[ctx->comp[0].stride]; - unsigned char *pout = &ctx->comp[0].pixels[ctx->comp[0].width]; + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); - for (y = ctx->comp[0].height - 1; y; --y) - { - memcpy(pout, pin, ctx->comp[0].width); - pin += ctx->comp[0].stride; - pout += ctx->comp[0].width; - } - ctx->comp[0].stride = ctx->comp[0].width; + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) + return 0; /* no match */ + + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) + return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) + return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; } -} - -enum rjpeg_decode_result rjpeg_decode( - struct rjpeg_data *ctx, - const unsigned char* jpeg, - const int size) -{ - ctx->pos = (const unsigned char*) jpeg; - ctx->size = size & 0x7FFFFFFF; - - if (ctx->size < 2) - return RJPEG_NOT_A_FILE; - if ((ctx->pos[0] ^ 0xFF) | (ctx->pos[1] ^ 0xD8)) - return RJPEG_NOT_A_FILE; - - rjpeg_skip(ctx, 2); - - while (!ctx->error) { - if ((ctx->size < 2) || (ctx->pos[0] != 0xFF)) - return RJPEG_SYNTAX_ERROR; - - rjpeg_skip(ctx, 2); - - switch (ctx->pos[-1]) - { - case RJPEG_DECODE_SOF: - rjpeg_decode_sof(ctx); - break; - case RJPEG_DECODE_DHT: - rjpeg_decode_dht(ctx); - break; - case RJPEG_DECODE_DQT: - rjpeg_decode_dqt(ctx); - break; - case RJPEG_DECODE_DRI: - rjpeg_decode_dri(ctx); - break; - case RJPEG_DECODE_SCAN: - rjpeg_decode_scan(ctx); - break; - case RJPEG_DECODE_SKIP_MARKER: - rjpeg_skip_marker(ctx); - break; - default: - if ((ctx->pos[-1] & 0xF0) != 0xE0) - return RJPEG_UNSUPPORTED; - rjpeg_skip_marker(ctx); - break; + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); /* should be 63, but might be 0 */ + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; } } - if (ctx->error != RJPEG_INTERNAL_FINISHED) - return ctx->error; - ctx->error = RJPEG_OK; - rjpeg_convert(ctx); - - return RJPEG_OK; + return 1; } -struct rjpeg_data *rjpeg_new(const uint8_t* data, size_t size) +static int stbi__process_frame_header(stbi__jpeg *z, int scan) { - char temp[64] = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, - 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, - 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, - 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 }; - struct rjpeg_data *ctx = (struct rjpeg_data*)calloc(1, sizeof(*ctx)); + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires 
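+   /* for reference, the baseline SOF payload being parsed here is laid out as:
+    *    Lf u16  header length         P  u8   sample precision (must be 8)
+    *    Y  u16  image height          X  u16  image width
+    *    Nf u8   component count (1 = grayscale, 3 = YCbCr)
+    * followed by Nf triples of { id, (H << 4) | V sampling factors, Tq } */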
+   c = stbi__get8(s);
+   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
 
-   if (!ctx)
-      return NULL;
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
-   memcpy(ctx->ZZ, temp, sizeof(ctx->ZZ));
-   rjpeg_decode(ctx, data, size);
+   for (i=0; i < s->img_n; ++i) {
+      z->img_comp[i].id = stbi__get8(s);
+      if (z->img_comp[i].id != i+1)   // JFIF requires
+         if (z->img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
+            return stbi__err("bad component ID","Corrupt JPEG");
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);   if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;     if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);   if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
 
-   return ctx;
+   if (scan != STBI__SCAN_load) return 1;
+
+   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
+
+      if (z->img_comp[i].raw_data == NULL) {
+         for(--i; i >= 0; --i) {
+            free(z->img_comp[i].raw_data);
+            z->img_comp[i].raw_data = NULL; // keep stbi__cleanup_jpeg from freeing this again
+            z->img_comp[i].data = NULL;
+         }
+         return stbi__err("outofmem", "Out of memory");
+      }
+      // align blocks for idct using mmx/sse
+      z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+      z->img_comp[i].linebuf = NULL;
+      if (z->progressive) {
+         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
+         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
+         z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         if (z->img_comp[i].raw_coeff == NULL) {
+            // mirror the raw_data cleanup above rather than dereferencing
+            // a failed allocation below
+            for(; i >= 0; --i) {
+               free(z->img_comp[i].raw_data);
+               z->img_comp[i].raw_data = NULL;
+               z->img_comp[i].data = NULL;
+               free(z->img_comp[i].raw_coeff);
+               z->img_comp[i].raw_coeff = NULL;
+               z->img_comp[i].coeff = NULL;
+            }
+            return stbi__err("outofmem", "Out of memory");
+         }
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      } else {
+         z->img_comp[i].coeff = 0;
+         z->img_comp[i].raw_coeff = 0;
+      }
+   }
+
+   return 1;
 }
 
-static void rjpeg_free(struct rjpeg_data *ctx)
+// use comparisons since in some cases we handle more than one case (e.g.
SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) { - int i; - - for (i = 0; i < 3; ++i) - if (ctx->comp[i].pixels) - free((void*) ctx->comp[i].pixels); - if (ctx->rgb) - free((void*)ctx->rgb); + int m; + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; } -bool rjpeg_image_load(uint8_t *buf, void *data, size_t size, +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + // handle 0s at the end of image data from IP Kamera 9060 + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + if (x == 255) { + j->marker = stbi__get8(j->s); + break; + } else if (x != 0) { + return stbi__err("junk before marker", "Corrupt JPEG"); + } + } + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + } else { + if (!stbi__process_marker(j, m)) return 0; + } + m = stbi__get_marker(j); + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef uint8_t *(*resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1, + int w, int hs); + +#define stbi__div4(x) ((uint8_t) ((x) >> 2)) + +static uint8_t *resample_row_1(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static uint8_t* stbi__resample_row_v_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static uint8_t* stbi__resample_row_h_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + uint8_t *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 
2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((uint8_t) ((x) >> 4)) + +static uint8_t *stbi__resample_row_hv_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static uint8_t *stbi__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + /* need to generate 2x2 samples for every one in input */ + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + /* process groups of 8 pixels for as long as we can. + * note we can't handle the last pixel in a row in this loop + * because we need to handle the filter boundary conditions. + */ + for (; i < ((w-1) & ~7); i += 8) + { +#if defined(STBI_SSE2) + /* load and perform the vertical filtering pass + * this uses 3*x + y = 4*x + (y - x) */ + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); /* current row */ + + /* horizontal filter works the same based on shifted vers of current + * row. "prev" is current row shifted right by 1 pixel; we need to + * insert the previous pixel value (from t1). + * "next" is current row shifted left by 1 pixel, with first pixel + * of next block of 8 pixels added in. + */ + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + /* horizontal filter, polyphase implementation since it's convenient: + * even pixels = 3*cur + prev = cur*4 + (prev - cur) + * odd pixels = 3*cur + next = cur*4 + (next - cur) + * note the shared term. */ + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + /* interleave even and odd pixels, then undo scaling. 
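+    * (with t = 3*near + far from the vertical pass, each output pixel is
+    * (3*t_this + t_adjacent + 8) >> 4, i.e. the JFIF-centered 2x2 kernel
+    * (9, 3, 3, 1) / 16, so the single shift removes both 4x scale factors
+    * and rounds at the same time.)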
*/ + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + /* pack and write output */ + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + /* horizontal filter, polyphase implementation since it's convenient: + * even pixels = 3*cur + prev = cur*4 + (prev - cur) + * odd pixels = 3*cur + next = cur*4 + (next - cur) + * note the shared term. + */ + int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + /* undo scaling and round, then store with even/odd phases interleaved */ + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + /* "previous" value for next iteration */ + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static uint8_t *stbi__resample_row_generic(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) +{ + /* resample with nearest-neighbor */ + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +#ifdef STBI_JPEG_OLD +/* this is the same YCbCr-to-RGB calculation that stb_image has used + * historically before the algorithm changes in 1.49 */ +#define float2fixed(x) ((int) ((x) * 65536 + 0.5)) +static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 16) + 32768; // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr*float2fixed(1.40200f); + g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f); + b = y_fixed + cb*float2fixed(1.77200f); + r >>= 16; + g >>= 16; + b >>= 16; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (uint8_t)r; + out[1] = (uint8_t)g; + out[2] = (uint8_t)b; + out[3] = 255; + out += step; + } +} +#else +/* this is a reduced-precision calculation of 
YCbCr-to-RGB introduced + * to make sure the code produces the same results in both SIMD and scalar */ +#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); /* rounding */ + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* float2fixed(1.40200f); + g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (uint8_t)r; + out[1] = (uint8_t)g; + out[2] = (uint8_t)b; + out[3] = 255; + out += step; + } +} +#endif + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + /* step == 3 is pretty ugly on the final interleave, and i'm not convinced + * it's useful in practice (you wouldn't use it for textures, for example). + * so just accelerate step == 4 case. + */ + if (step == 4) + { + /* this is a fairly straightforward implementation and not super-optimized. */ + __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); /* alpha channel */ + + for (; i+7 < count; i += 8) + { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } 
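+   /* a note on the fixed-point scaling above: the luma bytes are unpacked
+    * into the high byte of each 16-bit lane over a 128 bias byte, so after
+    * the logical >>4 each lane holds y*16 + 8, and the bias doubles as the
+    * rounding term.  The chroma lanes hold (c - 128) << 8, and with the
+    * constants scaled by 4096, _mm_mulhi_epi16 yields c*K*16.  Everything
+    * therefore meets at a common 16x scale, and one arithmetic >>4 brings
+    * the sums back to 8-bit range before packing.
+    */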
+#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. + uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* float2fixed(1.40200f); + g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (uint8_t)r; + out[1] = (uint8_t)g; + out[2] = (uint8_t)b; + out[3] = 255; + out += step; + } +} +#endif + +/* set up the kernels */ +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + #ifndef STBI_JPEG_OLD + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + #endif + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + #ifndef STBI_JPEG_OLD + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + #endif + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +/* clean up the temporary component buffers */ +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + int i; + for (i=0; i < j->s->img_n; ++i) { + if (j->img_comp[i].raw_data) { + free(j->img_comp[i].raw_data); + j->img_comp[i].raw_data = NULL; + j->img_comp[i].data = NULL; + } + if (j->img_comp[i].raw_coeff) { + free(j->img_comp[i].raw_coeff); + j->img_comp[i].raw_coeff = 0; + j->img_comp[i].coeff = 0; + } + if (j->img_comp[i].linebuf) { + free(j->img_comp[i].linebuf); + j->img_comp[i].linebuf = NULL; + } + } +} + +typedef struct +{ + 
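+   /* one of these per output component: 'resample' consumes the two
+    * nearest source rows (line0/line1) and emits one upsampled row into
+    * the component's linebuf; ystep/ypos track the phase of the vertical
+    * expansion so the near/far inputs can swap halfway through each
+    * source row. */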
resample_row_func resample; + uint8_t *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +static uint8_t *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n; + z->s->img_n = 0; // make stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n; + + if (z->s->img_n == 3 && n < 3) + decode_n = 1; + else + decode_n = z->s->img_n; + + // resample and color-convert + { + int k; + unsigned int i,j; + uint8_t *output; + uint8_t *coutput[4]; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + uint8_t *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? 
r->line0 : r->line1,
+                                  r->w_lores, r->hs);
+         if (++r->ystep >= r->vs) {
+            r->ystep = 0;
+            r->line0 = r->line1;
+            if (++r->ypos < z->img_comp[k].y)
+               r->line1 += z->img_comp[k].w2;
+         }
+      }
+      if (n >= 3) {
+         uint8_t *y = coutput[0];
+         if (z->s->img_n == 3) {
+            z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+         } else
+            for (i=0; i < z->s->img_x; ++i) {
+               out[0] = out[1] = out[2] = y[i];
+               out[3] = 255; // not used if n==3
+               out += n;
+            }
+      } else {
+         uint8_t *y = coutput[0];
+         if (n == 1)
+            for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+         else
+            for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+      }
+   }
+   stbi__cleanup_jpeg(z);
+   *out_x = z->s->img_x;
+   *out_y = z->s->img_y;
+   if (comp) *comp = z->s->img_n; // report original components, not output
+   return output;
+   }
+}
+
+static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__jpeg j;
+   j.s = s;
+   stbi__setup_jpeg(&j);
+   return load_jpeg_image(&j, x,y,comp,req_comp);
+}
+
+static int stbi__jpeg_test(stbi__context *s)
+{
+   int r;
+   stbi__jpeg j;
+   j.s = s;
+   stbi__setup_jpeg(&j);
+   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+bool rjpeg_image_load(uint8_t *_buf, void *data, size_t size,
       unsigned a_shift, unsigned r_shift,
       unsigned g_shift, unsigned b_shift)
 {
-   struct rjpeg_data *rjpg = rjpeg_new(buf, size);
+   int x, y, comp;
    struct texture_image *out_img = (struct texture_image*)data;
 
-   if (!rjpg)
-      goto error;
+   out_img->pixels = (uint32_t*)stbi_load_from_memory(_buf, (int)size, &x, &y, &comp, 4);
 
-   out_img->width  = rjpg->width;
-   out_img->height = rjpg->height;
-   out_img->pixels = (uint32_t*)malloc(rjpg->width * rjpg->height * rjpg->ncomp);
+   if (!out_img->pixels)
+   {
+      out_img->width = out_img->height = 0;
+      return false;
+   }
 
-   if (!out_img->pixels)
-      goto error;
-
-   if (rjpg->ncomp == 3)
+   out_img->width  = x;
+   out_img->height = y;
+
+   /* stb_image already emits RGBA32 texels, so the shift arguments are
+    * accepted for API compatibility but not applied here */
+   (void)a_shift;
+   (void)r_shift;
+   (void)g_shift;
+   (void)b_shift;
+
+#if 0
+   unsigned i;
+   for (i = 0; i < (x * y); i++)
    {
-      /* convert to RGB */
-      int x, yy;
-      uint32_t *prgb = (uint32_t*)out_img->pixels;
-      const unsigned char *py = rjpg->comp[0].pixels;
-      const unsigned char *pcb = rjpg->comp[1].pixels;
-      const unsigned char *pcr = rjpg->comp[2].pixels;
+      uint32_t r = (_buf[i] & 0xff00ff00);
+      uint32_t g = ((_buf[i] << 16) & 0x00ff0000);
+      uint32_t b = ((_buf[i] >> 16) & 0xff);
 
-      for (yy = rjpg->height; yy; --yy)
-      {
-         for (x = 0; x < rjpg->width; ++x)
-         {
-            int y = py[x] << 8;
-            int cb = pcb[x] - 128;
-            int cr = pcr[x] - 128;
-            *prgb++ = rjpeg_clip((y + 359 * cr + 128) >> 8);
-            *prgb++ = rjpeg_clip((y - 88 * cb - 183 * cr + 128) >> 8);
-            *prgb++ = rjpeg_clip((y + 454 * cb + 128) >> 8);
-         }
-         py += rjpg->comp[0].stride;
-         pcb += rjpg->comp[1].stride;
-         pcr += rjpg->comp[2].stride;
-      }
+      if (r_shift == 0 && b_shift == 16)
+         out_img->pixels[i] = _buf[i];
+      else
+         out_img->pixels[i] = r | g | b;
+      /* out_img->pixels[i] = (r << r_shift) | (g << g_shift) | (b << b_shift); */
    }
-
-   rjpeg_free(rjpg);
+#endif
 
    return true;
-
-error:
-   if (out_img->pixels)
-      free(out_img->pixels);
-
-   out_img->pixels = NULL;
-   out_img->width = out_img->height = 0;
-
-   if (rjpg)
-      rjpeg_free(rjpg);
-   return false;
 }
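+
+/* A minimal usage sketch for the loader above (hypothetical caller:
+ * use_texture() is a placeholder, and the shift arguments are shown only
+ * for the API shape, since stb_image already emits RGBA32):
+ *
+ *    struct texture_image img = {0};
+ *
+ *    if (rjpeg_image_load(file_buf, &img, file_len, 24, 16, 8, 0))
+ *    {
+ *       use_texture(img.pixels, img.width, img.height);
+ *       free(img.pixels);
+ *    }
+ */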