Util: Add utf8strlen, tests

2022-05-15 21:18:48 -07:00 · 2022-05-15 21:18:48 -07:00 · 155c57b19e
parent e9ba117258
commit 155c57b19e
3 changed files with 51 additions and 13 deletions
--- a/include/mgba-util/string.h
+++ b/include/mgba-util/string.h
@@ -34,6 +34,7 @@ char* utf16to8(const uint16_t* utf16, size_t length);
 uint32_t utf8Char(const char** unicode, size_t* length);
 uint32_t utf16Char(const uint16_t** unicode, size_t* length);
 char* gbkToUtf8(const char* gbk, size_t length);
+size_t utf8strlen(const char* string);

 int hexDigit(char digit);
 const char* hex32(const char* line, uint32_t* out);
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -29,6 +29,7 @@ set(GUI_FILES

 set(TEST_FILES
 	test/string-parser.c
+	test/string-utf8.c
 	test/text-codec.c
 	test/vfs.c)

--- a/src/util/string.c
+++ b/src/util/string.c
@@ -108,11 +108,30 @@ uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
 	return (highSurrogate << 10) + lowSurrogate + 0x10000;
 }

+static const uint8_t _utf8len[0x40] = { /* Sequence length in bytes for a lead byte, indexed by (byte >> 2); 0 means the byte cannot start a sequence */
+	/* 0000 xxxx */ 1, 1, 1, 1, /* each entry covers four consecutive byte values; 0x00-0x7F are single-byte */
+	/* 0001 xxxx */ 1, 1, 1, 1,
+	/* 0010 xxxx */ 1, 1, 1, 1,
+	/* 0011 xxxx */ 1, 1, 1, 1,
+	/* 0100 xxxx */ 1, 1, 1, 1,
+	/* 0101 xxxx */ 1, 1, 1, 1,
+	/* 0110 xxxx */ 1, 1, 1, 1,
+	/* 0111 xxxx */ 1, 1, 1, 1,
+	/* 1000 xxxx */ 0, 0, 0, 0, /* 0x80-0xBF are continuation bytes, never a lead byte */
+	/* 1001 xxxx */ 0, 0, 0, 0,
+	/* 1010 xxxx */ 0, 0, 0, 0,
+	/* 1011 xxxx */ 0, 0, 0, 0,
+	/* 1100 xxxx */ 2, 2, 2, 2, /* 0xC0-0xDF start a two-byte sequence */
+	/* 1101 xxxx */ 2, 2, 2, 2,
+	/* 1110 xxxx */ 3, 3, 3, 3, /* 0xE0-0xEF start a three-byte sequence */
+	/* 1111 xxxx */ 4, 4, 0, 0 /* 0xF0-0xF7 start a four-byte sequence; 0xF8-0xFF are rejected. NOTE(review): RFC 3629 also disallows 0xC0, 0xC1 and 0xF5-0xF7 as lead bytes - confirm accepting them here is intended */
+};
+
 uint32_t utf8Char(const char** unicode, size_t* length) {
 	if (*length == 0) {
 		return 0;
 	}
-	char byte = **unicode;
+	unsigned char byte = **unicode;
 	--*length;
 	++*unicode;
 	if (!(byte & 0x80)) {
@@ -120,23 +139,17 @@ uint32_t utf8Char(const char** unicode, size_t* length) {
 	}
 	uint32_t unichar;
 	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
-	size_t numBytes;
-	for (numBytes = 0; numBytes < 3; ++numBytes) {
-		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
-			break;
-		}
+	size_t numBytes = _utf8len[byte >> 2];
+	unichar = byte & ~tops[numBytes - 1];
+	if (numBytes == 0) {
+		return 0xFFFD;
 	}
-	unichar = byte & ~tops[numBytes];
-	if (numBytes == 3) {
-		return 0;
-	}
-	++numBytes;
 	if (*length < numBytes) {
 		*length = 0;
-		return 0;
+		return 0xFFFD;
 	}
 	size_t i;
-	for (i = 0; i < numBytes; ++i) {
+	for (i = 1; i < numBytes; ++i) {
 		unichar <<= 6;
 		byte = **unicode;
 		--*length;
@@ -341,6 +354,29 @@ char* gbkToUtf8(const char* gbk, size_t length) {
 	return newUTF8;
 }

+size_t utf8strlen(const char* string) { /* Counts the characters in a NUL-terminated string, treating each multi-byte sequence as a single character */
+	size_t size = 0;
+	for (size = 0; *string; ++size) { /* one iteration, and one count, per sequence until the terminating NUL */
+		size_t numBytes = 1;
+		if (*string & 0x80) { /* high bit set: not a single-byte (7-bit) character */
+			numBytes = _utf8len[((uint8_t) *string) >> 2]; /* the table is indexed by the top six bits of the lead byte */
+			if (!numBytes) {
+				numBytes = 1; /* byte cannot start a sequence: count it on its own and move on */
+			} else {
+				size_t i;
+				for (i = 1; i < numBytes; ++i) {
+					if ((string[i] & 0xC0) != 0x80) { /* not a 10xxxxxx continuation byte; the terminating NUL also lands here, so the scan never passes it */
+						break;
+					}
+				}
+				numBytes = i; /* skip only the bytes actually matched, so a truncated sequence still counts once */
+			}
+		}
+		string += numBytes;
+	}
+	return size;
+}
+
 int hexDigit(char digit) {
 	switch (digit) {
 	case '0':