// StringUtil: Add UTF-8 encode and decode
// Origin: patch 22ed71cef431a2442abff8b4a1439d419f3bcab6 (Connor McLaughlin, 2022-09-17),
// touching common/StringUtil.cpp (definitions) and common/StringUtil.h (declarations).

namespace StringUtil
{
	/// Appends the UTF-8 encoding of the UTF-32 codepoint `ch` to `s`.
	///
	/// Codepoints that are not Unicode scalar values (UTF-16 surrogates
	/// U+D800..U+DFFF, or anything above U+10FFFF) cannot be represented in
	/// well-formed UTF-8, so U+FFFD REPLACEMENT CHARACTER (EF BF BD) is
	/// appended in their place.
	void EncodeAndAppendUTF8(std::string& s, char32_t ch)
	{
		if (ch > 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF))
			ch = 0xFFFD;

		// Go through unsigned char so the narrowing to char is a plain byte reinterpretation.
		const auto put = [&s](char32_t byte) {
			s.push_back(static_cast<char>(static_cast<unsigned char>(byte)));
		};

		if (ch <= 0x7F)
		{
			put(ch);
		}
		else if (ch <= 0x07FF)
		{
			put(0xC0 | (ch >> 6));
			put(0x80 | (ch & 0x3F));
		}
		else if (ch <= 0xFFFF)
		{
			put(0xE0 | (ch >> 12));
			put(0x80 | ((ch >> 6) & 0x3F));
			put(0x80 | (ch & 0x3F));
		}
		else
		{
			put(0xF0 | (ch >> 18));
			put(0x80 | ((ch >> 12) & 0x3F));
			put(0x80 | ((ch >> 6) & 0x3F));
			put(0x80 | (ch & 0x3F));
		}
	}

	/// Decodes a single codepoint from the start of a UTF-8 byte sequence.
	///
	/// @param bytes  Pointer to the first byte to decode.
	/// @param length Number of readable bytes at `bytes`.
	/// @param ch     Receives the decoded codepoint, or 0xFFFFFFFF when the
	///               input is empty or not a well-formed UTF-8 sequence.
	/// @return Number of bytes consumed: 1-4 for a decoded codepoint, 1 for a
	///         malformed sequence (so the caller can skip the offending byte
	///         and resynchronise), 0 only when there is nothing to decode.
	size_t DecodeUTF8(const void* bytes, size_t length, char32_t* ch)
	{
		const unsigned char* s = static_cast<const unsigned char*>(bytes);

		// Assume failure; only a fully validated sequence overwrites this.
		*ch = 0xFFFFFFFFu;
		if (!s || length == 0)
			return 0;

		const unsigned char lead = s[0];
		if (lead < 0x80)
		{
			*ch = lead;
			return 1;
		}

		size_t count;     // total bytes in the sequence, as announced by the lead byte
		char32_t value;   // payload bits accumulated so far
		char32_t minimum; // smallest codepoint that legitimately needs `count` bytes
		if ((lead & 0xE0) == 0xC0)
		{
			count = 2;
			value = lead & 0x1Fu;
			minimum = 0x80;
		}
		else if ((lead & 0xF0) == 0xE0)
		{
			count = 3;
			value = lead & 0x0Fu;
			minimum = 0x800;
		}
		else if ((lead & 0xF8) == 0xF0)
		{
			count = 4;
			value = lead & 0x07u;
			minimum = 0x10000;
		}
		else
		{
			// Stray continuation byte (10xxxxxx) or a byte that never appears in UTF-8.
			return 1;
		}

		// Truncated sequence.
		if (length < count)
			return 1;

		for (size_t i = 1; i < count; i++)
		{
			// Every trailing byte must have the form 10xxxxxx.
			if ((s[i] & 0xC0) != 0x80)
				return 1;

			value = (value << 6) | (s[i] & 0x3Fu);
		}

		// Reject overlong forms, encoded surrogates, and values beyond U+10FFFF.
		if (value < minimum || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
			return 1;

		*ch = value;
		return count;
	}

	/// Decodes the codepoint starting at byte `offset` of `str`.
	/// `offset` is taken by value; advance it by the returned byte count.
	/// An offset at or past the end of `str` yields 0xFFFFFFFF and returns 0.
	size_t DecodeUTF8(const std::string_view& str, size_t offset, char32_t* ch)
	{
		// Clamp so the remaining-length subtraction below cannot wrap around.
		if (offset > str.length())
			offset = str.length();

		return DecodeUTF8(str.data() + offset, str.length() - offset, ch);
	}

	/// Decodes the codepoint starting at byte `offset` of `str`.
	/// Same contract as the std::string_view overload.
	size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch)
	{
		return DecodeUTF8(std::string_view(str), offset, ch);
	}
} // namespace StringUtil