339 lines
8.1 KiB
C
339 lines
8.1 KiB
C
/* UTF-8 codec

   This file is part of the Public Domain C Library (PDCLib).
   Permission is granted to use, modify, and / or redistribute at will.
*/
|
|
|
|
#ifndef REGTEST
|
|
#include <stdbool.h>
|
|
#include <stdint.h>
|
|
#include <uchar.h>
|
|
#include <assert.h>
|
|
#include "_PDCLIB_encoding.h"
|
|
|
|
/* Use of the mbstate:
 *
 * _StUC[0] is the current decoding or encoding state; 0 (DecStart / EncStart)
 *          is the initial state, i.e. no character is partially converted
 * _St32[1] is the character accumulated so far
 */
|
|
|
|
static bool utf8_mbsinit( const mbstate_t *p_s )
|
|
{ return p_s->_StUC[0] == 0; }
|
|
|
|
/* Decoder states kept in _StUC[0].  DecNBM means: inside an N-byte sequence,
 * the next input byte is byte number M of that sequence.
 */
enum {
    DecStart = 0,   /* not inside a multi-byte sequence */

    Dec2B2   = 1,   /* 2-byte sequence, expecting byte 2 */

    Dec3B2   = 2,   /* 3-byte sequence, expecting byte 2 */
    Dec3B3   = 3,   /* 3-byte sequence, expecting byte 3 */

    Dec4B2   = 4,   /* 4-byte sequence, expecting byte 2 */
    Dec4B3   = 5,   /* 4-byte sequence, expecting byte 3 */
    Dec4B4   = 6    /* 4-byte sequence, expecting byte 4 */
};
|
|
|
|
/* Shorthand for the two mbstate_t fields this codec uses.  Both require a
 * variable `p_s` (pointer to the conversion state) to be in scope.
 */
#define state (p_s->_StUC[0])
#define accum (p_s->_St32[1])

/* Opens the body of a conversion function: declares `result`, the value that
 * END_CONVERSION eventually returns.  Used without a trailing semicolon.
 * (No trailing line continuation, so the macro cannot swallow whatever line
 * happens to follow it.)
 */
#define START_CONVERSION \
    bool result = true;

/* Closes the body of a conversion function: the single exit point that
 * FINISH() jumps to.
 */
#define END_CONVERSION \
    end_conversion: \
        return result

/* Leave the conversion function immediately, returning _r. */
#define FINISH(_r) do { \
        result = (_r); \
        goto end_conversion; \
    } while(0)

/* Emit one decoded code point.  When p_outbuf is NULL nothing is stored, but
 * the remaining output count is still decremented.  Afterwards the decoder is
 * back in its start state.
 */
#define OUT32(_c) do { \
        if(p_outbuf) \
            (*((*p_outbuf)++)) = (_c); \
        (*p_outsz)--; \
        _PDCLIB_UNDEFINED(accum); \
        state = DecStart; \
    } while(0)

/* Fail the conversion unless the current input byte `c` has the form
 * 10xxxxxx.  Goes through FINISH so that every failure leaves the function
 * via the same exit path.
 */
#define CHECK_CONTINUATION \
    do { if((c & 0xC0) != 0x80) FINISH(false); } while(0)
|
|
|
|
static bool utf8toc32(
|
|
char32_t *restrict *restrict p_outbuf,
|
|
size_t *restrict p_outsz,
|
|
const char *restrict *restrict p_inbuf,
|
|
size_t *restrict p_insz,
|
|
mbstate_t *restrict p_s
|
|
)
|
|
{
|
|
START_CONVERSION
|
|
while(*p_outsz && *p_insz) {
|
|
unsigned char c = **p_inbuf;
|
|
char32_t c32;
|
|
switch(state) {
|
|
case DecStart:
|
|
// 1 byte
|
|
if(c <= 0x7F) {
|
|
OUT32(c);
|
|
} else if(c <= 0xDF) {
|
|
accum = (c & 0x1F) << 6;
|
|
state = Dec2B2;
|
|
} else if(c <= 0xEF) {
|
|
accum = (c & 0x0F) << 12;
|
|
state = Dec3B2;
|
|
} else if(c <= 0xF4) {
|
|
accum = (c & 0x07) << 18;
|
|
state = Dec4B2;
|
|
} else {
|
|
// 5+byte sequence illegal
|
|
FINISH(false);
|
|
}
|
|
break;
|
|
|
|
case Dec2B2:
|
|
CHECK_CONTINUATION;
|
|
|
|
c32 = accum | (c & 0x3F);
|
|
|
|
// Overlong sequence (e.g. NUL injection)
|
|
if(c32 <= 0x7F)
|
|
FINISH(false);
|
|
|
|
OUT32(c32);
|
|
break;
|
|
|
|
case Dec3B2:
|
|
CHECK_CONTINUATION;
|
|
accum |= (c & 0x3F) << 6;
|
|
state = Dec3B3;
|
|
break;
|
|
|
|
case Dec3B3:
|
|
CHECK_CONTINUATION;
|
|
|
|
c32 = accum | (c & 0x3F);
|
|
|
|
// Overlong
|
|
if(c32 <= 0x07FF)
|
|
FINISH(false);
|
|
|
|
// Surrogate
|
|
if(c32 >= 0xD800 && c32 <= 0xDFFF)
|
|
FINISH(false);
|
|
|
|
OUT32(c32);
|
|
break;
|
|
|
|
case Dec4B2:
|
|
CHECK_CONTINUATION;
|
|
accum |= (c & 0x3F) << 12;
|
|
state = Dec4B3;
|
|
break;
|
|
|
|
case Dec4B3:
|
|
CHECK_CONTINUATION;
|
|
accum |= (c & 0x3F) << 6;
|
|
state = Dec4B4;
|
|
break;
|
|
|
|
case Dec4B4:
|
|
CHECK_CONTINUATION;
|
|
|
|
c32 = accum | (c & 0x3F);
|
|
|
|
// Overlong
|
|
if(c32 <= 0xFFFF) FINISH(false);
|
|
|
|
// Not in Unicode
|
|
if(c32 > 0x10FFFF) FINISH(false);
|
|
|
|
OUT32(c32);
|
|
break;
|
|
|
|
default:
|
|
assert(!"Invalid state");
|
|
}
|
|
|
|
(*p_inbuf)++;
|
|
(*p_insz)--;
|
|
}
|
|
END_CONVERSION;
|
|
}
|
|
|
|
/* Encoder states kept in _StUC[0].  EncNR means: N continuation bytes of the
 * character held in the accumulator still have to be written.
 */
enum {
    EncStart = 0,   /* no character in progress */
    Enc1R    = 1,   /* 1 continuation byte remaining */
    Enc2R    = 2,   /* 2 continuation bytes remaining */
    Enc3R    = 3    /* 3 continuation bytes remaining */
};
|
|
|
|
static bool c32toutf8(
|
|
char *restrict *restrict p_outbuf,
|
|
size_t *restrict p_outsz,
|
|
const char32_t *restrict *restrict p_inbuf,
|
|
size_t *restrict p_insz,
|
|
mbstate_t *restrict p_s
|
|
)
|
|
{
|
|
START_CONVERSION
|
|
while(*p_outsz) {
|
|
unsigned char outc = 0;
|
|
switch(state) {
|
|
case Enc3R:
|
|
outc = 0x80 | ((accum >> 12) & 0x3F);
|
|
state = Enc2R;
|
|
break;
|
|
|
|
case Enc2R:
|
|
outc = 0x80 | ((accum >> 6) & 0x3F);
|
|
state = Enc1R;
|
|
break;
|
|
|
|
case Enc1R:
|
|
outc = 0x80 | (accum & 0x3F);
|
|
state = EncStart;
|
|
_PDCLIB_UNDEFINED(accum);
|
|
break;
|
|
|
|
case EncStart:
|
|
if(*p_insz == 0)
|
|
FINISH(true);
|
|
|
|
accum = **p_inbuf;
|
|
(*p_inbuf)++;
|
|
(*p_insz)--;
|
|
|
|
if(accum <= 0x7F) {
|
|
outc = accum;
|
|
state = EncStart;
|
|
_PDCLIB_UNDEFINED(accum);
|
|
} else if(accum <= 0x7FF) {
|
|
outc = 0xC0 | (accum >> 6);
|
|
state = Enc1R;
|
|
} else if(accum <= 0xFFFF) {
|
|
outc = 0xE0 | (accum >> 12);
|
|
state = Enc2R;
|
|
} else if(accum <= 0x10FFFF) {
|
|
outc = 0xF0 | (accum >> 18);
|
|
state = Enc3R;
|
|
} else {
|
|
FINISH(false);
|
|
}
|
|
break;
|
|
}
|
|
|
|
if(p_outbuf) {
|
|
**p_outbuf = outc;
|
|
(*p_outbuf)++;
|
|
}
|
|
(*p_outsz)--;
|
|
}
|
|
END_CONVERSION;
|
|
}
|
|
|
|
const struct _PDCLIB_charcodec_t _PDCLIB_utf8_codec = {
|
|
.__mbsinit = utf8_mbsinit,
|
|
.__mbstoc32s = utf8toc32,
|
|
.__c32stombs = c32toutf8,
|
|
.__mb_max = 4,
|
|
};
|
|
|
|
#endif
|
|
|
|
#ifdef TEST
|
|
#include "_PDCLIB_test.h"
|
|
|
|
int main( void )
|
|
{
|
|
#ifndef REGTEST
|
|
// Valid conversion & back
|
|
|
|
static const char* input = "abcde" "\xDF\xBF" "\xEF\xBF\xBF"
|
|
"\xF4\x8F\xBF\xBF";
|
|
|
|
char32_t c32out[8];
|
|
|
|
char32_t *c32ptr = &c32out[0];
|
|
size_t c32rem = 8;
|
|
const char *chrptr = (char*) &input[0];
|
|
size_t chrrem = strlen(input);
|
|
mbstate_t mbs = { 0 };
|
|
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
|
|
TESTCASE(c32rem == 0);
|
|
TESTCASE(chrrem == 0);
|
|
TESTCASE(c32ptr == &c32out[8]);
|
|
TESTCASE(chrptr == &input[strlen(input)]);
|
|
TESTCASE(c32out[0] == 'a' && c32out[1] == 'b' && c32out[2] == 'c' &&
|
|
c32out[3] == 'd' && c32out[4] == 'e' && c32out[5] == 0x7FF &&
|
|
c32out[6] == 0xFFFF && c32out[7] == 0x10FFFF);
|
|
|
|
char chrout[strlen(input)];
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &chrout[0];
|
|
chrrem = strlen(input);
|
|
TESTCASE(c32toutf8(&chrptr, &chrrem, &c32ptr, &c32rem, &mbs));
|
|
TESTCASE(c32rem == 0);
|
|
TESTCASE(chrrem == 0);
|
|
TESTCASE(c32ptr == &c32out[8]);
|
|
TESTCASE(chrptr == &chrout[strlen(input)]);
|
|
TESTCASE(memcmp(chrout, input, strlen(input)) == 0);
|
|
|
|
// Multi-part conversion
|
|
static const char* mpinput = "\xDF\xBF";
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &mpinput[0];
|
|
chrrem = 1;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
|
|
TESTCASE(c32ptr == &c32out[0]);
|
|
TESTCASE(c32rem == 8);
|
|
TESTCASE(chrptr == &mpinput[1]);
|
|
TESTCASE(chrrem == 0);
|
|
chrrem = 1;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs));
|
|
TESTCASE(c32ptr == &c32out[1]);
|
|
TESTCASE(c32rem == 7);
|
|
TESTCASE(chrptr == &mpinput[2]);
|
|
TESTCASE(chrrem == 0);
|
|
|
|
// Invalid conversions
|
|
|
|
// Overlong nuls
|
|
const char* nul2 = "\xC0\x80";
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &nul2[0];
|
|
chrrem = 2;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
|
|
memset(&mbs, 0, sizeof mbs);
|
|
const char* nul3 = "\xE0\x80\x80";
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &nul3[0];
|
|
chrrem = 3;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
|
|
memset(&mbs, 0, sizeof mbs);
|
|
const char* nul4 = "\xF0\x80\x80\x80";
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &nul4[0];
|
|
chrrem = 4;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
|
|
|
|
// Starting on a continuation
|
|
const char* cont = "\x80";
|
|
c32ptr = &c32out[0];
|
|
c32rem = 8;
|
|
chrptr = &cont[0];
|
|
chrrem = 1;
|
|
TESTCASE(utf8toc32(&c32ptr, &c32rem, &chrptr, &chrrem, &mbs) == false);
|
|
#endif
|
|
return TEST_RESULTS;
|
|
}
|
|
|
|
#endif
|
|
|