From 2999f15d7aa9e0d020044f417aa58d6e3e6ed737 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 8 Dec 2024 02:56:52 +1000 Subject: [PATCH] StringUtil: Add UTF-16 encoding/decoding functions --- src/common/string_util.cpp | 154 +++++++++++++++++++++++++++++++++++-- src/common/string_util.h | 11 ++- 2 files changed, 159 insertions(+), 6 deletions(-) diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 18cc8110d..adaa4bc42 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #ifndef __APPLE__ @@ -442,7 +443,7 @@ bool StringUtil::ParseAssignmentString(const std::string_view str, std::string_v void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch) { - if (ch <= 0x7F) + if (ch <= 0x7F) [[likely]] { s.push_back(static_cast(static_cast(ch))); } @@ -472,17 +473,84 @@ void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch) } } +size_t StringUtil::GetEncodedUTF8Length(char32_t ch) +{ + if (ch <= 0x7F) [[likely]] + return 1; + else if (ch <= 0x07FF) + return 2; + else if (ch <= 0xFFFF) + return 3; + else if (ch <= 0x10FFFF) + return 4; + else + return 3; +} + +size_t StringUtil::EncodeAndAppendUTF8(void* utf8, size_t pos, size_t size, char32_t ch) +{ + u8* utf8_bytes = static_cast(utf8) + pos; + if (ch <= 0x7F) [[likely]] + { + if (pos == size) [[unlikely]] + return 0; + + utf8_bytes[0] = static_cast(ch); + return 1; + } + else if (ch <= 0x07FF) + { + if ((pos + 1) >= size) [[unlikely]] + return 0; + + utf8_bytes[0] = static_cast(0xc0 | static_cast((ch >> 6) & 0x1f)); + utf8_bytes[1] = static_cast(0x80 | static_cast((ch & 0x3f))); + return 2; + } + else if (ch <= 0xFFFF) + { + if ((pos + 3) >= size) [[unlikely]] + return 0; + + utf8_bytes[0] = static_cast(0xe0 | static_cast(((ch >> 12) & 0x0f))); + utf8_bytes[1] = static_cast(0x80 | static_cast(((ch >> 6) & 0x3f))); + utf8_bytes[2] = static_cast(0x80 | static_cast((ch & 0x3f))); + return 3; + } + else if 
(ch <= 0x10FFFF) + { + if ((pos + 4) >= size) [[unlikely]] + return 0; + + utf8_bytes[0] = static_cast(0xf0 | static_cast(((ch >> 18) & 0x07))); + utf8_bytes[1] = static_cast(0x80 | static_cast(((ch >> 12) & 0x3f))); + utf8_bytes[2] = static_cast(0x80 | static_cast(((ch >> 6) & 0x3f))); + utf8_bytes[3] = static_cast(0x80 | static_cast((ch & 0x3f))); + return 4; + } + else + { + if ((pos + 3) >= size) [[unlikely]] + return 0; + + utf8_bytes[0] = 0xefu; + utf8_bytes[1] = 0xbfu; + utf8_bytes[2] = 0xbdu; + return 3; + } +} + size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch) { const u8* s = reinterpret_cast(bytes); - if (s[0] < 0x80) + if (s[0] < 0x80) [[likely]] { *ch = s[0]; return 1; } else if ((s[0] & 0xe0) == 0xc0) { - if (length < 2) + if (length < 2) [[unlikely]] goto invalid; *ch = static_cast((static_cast(s[0] & 0x1f) << 6) | (static_cast(s[1] & 0x3f) << 0)); @@ -490,7 +558,7 @@ size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch) } else if ((s[0] & 0xf0) == 0xe0) { - if (length < 3) + if (length < 3) [[unlikely]] goto invalid; *ch = static_cast((static_cast(s[0] & 0x0f) << 12) | (static_cast(s[1] & 0x3f) << 6) | @@ -499,7 +567,7 @@ size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch) } else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) { - if (length < 4) + if (length < 4) [[unlikely]] goto invalid; *ch = static_cast((static_cast(s[0] & 0x07) << 18) | (static_cast(s[1] & 0x3f) << 12) | @@ -512,6 +580,82 @@ invalid: return 1; } +size_t StringUtil::EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint) +{ + u8* const utf16_bytes = std::assume_aligned(static_cast(utf16)) + (pos * sizeof(u16)); + if (codepoint <= 0xFFFF) [[likely]] + { + if (pos == size) [[unlikely]] + return 0; + + // surrogates are invalid + const u16 codepoint16 = + static_cast((codepoint >= 0xD800 && codepoint <= 0xDFFF) ? 
UNICODE_REPLACEMENT_CHARACTER : codepoint); + std::memcpy(utf16_bytes, &codepoint16, sizeof(codepoint16)); + return 1; + } + else if (codepoint <= 0x10FFFF) + { + if ((pos + 1) >= size) [[unlikely]] + return 0; + + codepoint -= 0x010000; + + const u16 low = static_cast(((static_cast(codepoint) >> 10) & 0x3FFu) + 0xD800); + const u16 high = static_cast((static_cast(codepoint) & 0x3FFu) + 0xDC00); + std::memcpy(utf16_bytes, &low, sizeof(high)); + std::memcpy(utf16_bytes + sizeof(u16), &high, sizeof(high)); + return 2; + } + else + { + // unrepresentable + constexpr u16 value = static_cast(UNICODE_REPLACEMENT_CHARACTER); + std::memcpy(utf16_bytes, &value, sizeof(value)); + return 1; + } +} + +size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, char32_t* ch) +{ + const u8* const utf16_bytes = std::assume_aligned(static_cast(bytes)) + pos * sizeof(u16); + + u16 high; + std::memcpy(&high, utf16_bytes, sizeof(high)); + + // High surrogate? + if (high >= 0xD800 && high <= 0xDBFF) [[unlikely]] + { + if (length < 2) [[unlikely]] + { + // Missing low surrogate. + *ch = UNICODE_REPLACEMENT_CHARACTER; + return 1; + } + + u16 low; + std::memcpy(&low, utf16_bytes + sizeof(u16), sizeof(low)); + if (low >= 0xDC00 && low <= 0xDFFF) [[likely]] + { + *ch = static_cast(((static_cast(high) - 0xD800u) << 10) + ((static_cast(low) - 0xDC00)) + + 0x10000u); + return 2; + } + else + { + // Invalid high surrogate. + *ch = UNICODE_REPLACEMENT_CHARACTER; + return 2; + } + } + else + { + // Single 16-bit value. 
+ *ch = static_cast(high); + return 1; + } +} + std::string StringUtil::Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis /*= "..."*/) { std::string ret; diff --git a/src/common/string_util.h b/src/common/string_util.h index 8c01ec17e..707632e21 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -361,13 +361,22 @@ static constexpr char32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD; /// Appends a UTF-16/UTF-32 codepoint to a UTF-8 string. void EncodeAndAppendUTF8(std::string& s, char32_t ch); +size_t EncodeAndAppendUTF8(void* utf8, size_t pos, size_t size, char32_t ch); +size_t GetEncodedUTF8Length(char32_t ch); -/// Decodes UTF-8 to a single codepoint, updating the position parameter. +/// Decodes UTF-8 to a single unicode codepoint. /// Returns the number of bytes the codepoint took in the original string. size_t DecodeUTF8(const void* bytes, size_t length, char32_t* ch); size_t DecodeUTF8(const std::string_view str, size_t offset, char32_t* ch); size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch); +/// Appends a unicode codepoint to a UTF-16 string. +size_t EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint); + +/// Decodes UTF-16 to a single unicode codepoint. +/// Returns the number of 16-bit code units (not bytes) the codepoint took in the original string. +size_t DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint); + // Replaces the end of a string with ellipsis if it exceeds the specified length. std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "..."); void EllipsiseInPlace(std::string& str, u32 max_length, const char* ellipsis = "...");