diff --git a/include/mgba-util/string.h b/include/mgba-util/string.h index 476b39848..cc7169bf2 100644 --- a/include/mgba-util/string.h +++ b/include/mgba-util/string.h @@ -34,6 +34,7 @@ char* utf16to8(const uint16_t* utf16, size_t length); uint32_t utf8Char(const char** unicode, size_t* length); uint32_t utf16Char(const uint16_t** unicode, size_t* length); char* gbkToUtf8(const char* gbk, size_t length); +size_t utf8strlen(const char* string); int hexDigit(char digit); const char* hex32(const char* line, uint32_t* out); diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 26fbf690a..d43f901f7 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -29,6 +29,7 @@ set(GUI_FILES set(TEST_FILES test/string-parser.c + test/string-utf8.c test/text-codec.c test/vfs.c) diff --git a/src/util/string.c b/src/util/string.c index d9dc701b1..a561a5fb1 100644 --- a/src/util/string.c +++ b/src/util/string.c @@ -108,11 +108,30 @@ uint32_t utf16Char(const uint16_t** unicode, size_t* length) { return (highSurrogate << 10) + lowSurrogate + 0x10000; } +static const uint8_t _utf8len[0x40] = { + /* 0000 xxxx */ 1, 1, 1, 1, + /* 0001 xxxx */ 1, 1, 1, 1, + /* 0010 xxxx */ 1, 1, 1, 1, + /* 0011 xxxx */ 1, 1, 1, 1, + /* 0100 xxxx */ 1, 1, 1, 1, + /* 0101 xxxx */ 1, 1, 1, 1, + /* 0110 xxxx */ 1, 1, 1, 1, + /* 0111 xxxx */ 1, 1, 1, 1, + /* 1000 xxxx */ 0, 0, 0, 0, + /* 1001 xxxx */ 0, 0, 0, 0, + /* 1010 xxxx */ 0, 0, 0, 0, + /* 1011 xxxx */ 0, 0, 0, 0, + /* 1100 xxxx */ 2, 2, 2, 2, + /* 1101 xxxx */ 2, 2, 2, 2, + /* 1110 xxxx */ 3, 3, 3, 3, + /* 1111 xxxx */ 4, 4, 0, 0 +}; + uint32_t utf8Char(const char** unicode, size_t* length) { if (*length == 0) { return 0; } - char byte = **unicode; + unsigned char byte = **unicode; --*length; ++*unicode; if (!(byte & 0x80)) { @@ -120,23 +139,17 @@ uint32_t utf8Char(const char** unicode, size_t* length) { } uint32_t unichar; static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 }; - size_t numBytes; - for (numBytes = 0; numBytes < 3; ++numBytes) { - if ((byte & tops[numBytes + 1]) == tops[numBytes]) { - break; - } + size_t numBytes = _utf8len[byte >> 2]; + unichar = byte & ~tops[numBytes - 1]; + if (numBytes == 0) { + return 0xFFFD; } - unichar = byte & ~tops[numBytes]; - if (numBytes == 3) { - return 0; - } - ++numBytes; if (*length < numBytes) { *length = 0; - return 0; + return 0xFFFD; } size_t i; - for (i = 0; i < numBytes; ++i) { + for (i = 1; i < numBytes; ++i) { unichar <<= 6; byte = **unicode; --*length; @@ -341,6 +354,29 @@ char* gbkToUtf8(const char* gbk, size_t length) { return newUTF8; } +size_t utf8strlen(const char* string) { + size_t size = 0; + for (size = 0; *string; ++size) { + size_t numBytes = 1; + if (*string & 0x80) { + numBytes = _utf8len[((uint8_t) *string) >> 2]; + if (!numBytes) { + numBytes = 1; + } else { + size_t i; + for (i = 1; i < numBytes; ++i) { + if ((string[i] & 0xC0) != 0x80) { + break; + } + } + numBytes = i; + } + } + string += numBytes; + } + return size; +} + int hexDigit(char digit) { switch (digit) { case '0':