StringUtil: Add UTF-16 encoding/decoding functions

This commit is contained in:
Stenzek 2024-12-08 02:56:52 +10:00
parent 7f3687de81
commit 2999f15d7a
No known key found for this signature in database
2 changed files with 159 additions and 6 deletions

View File

@ -7,6 +7,7 @@
#include <cctype>
#include <codecvt>
#include <cstdio>
#include <memory>
#include <sstream>
#ifndef __APPLE__
@ -442,7 +443,7 @@ bool StringUtil::ParseAssignmentString(const std::string_view str, std::string_v
void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch)
{
if (ch <= 0x7F)
if (ch <= 0x7F) [[likely]]
{
s.push_back(static_cast<char>(static_cast<u8>(ch)));
}
@ -472,17 +473,84 @@ void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch)
}
}
/// Returns the number of bytes a codepoint occupies when encoded as UTF-8.
///
/// @param ch Codepoint to measure.
/// @return 1 for U+0000..U+007F, 2 for U+0080..U+07FF, 3 for U+0800..U+FFFF and
///         4 for U+10000..U+10FFFF. Values above U+10FFFF are reported as 3 bytes.
size_t StringUtil::GetEncodedUTF8Length(char32_t ch)
{
  // ASCII is by far the most common case.
  if (ch <= 0x7F) [[likely]]
    return 1;

  if (ch <= 0x07FF)
    return 2;

  // The remainder of the BMP needs three bytes, and so does anything beyond the
  // last valid codepoint (same length as the three-byte U+FFFD sequence).
  const bool out_of_range = (ch > 0x10FFFF);
  if (ch <= 0xFFFF || out_of_range)
    return 3;

  // Supplementary planes, U+10000..U+10FFFF.
  return 4;
}
size_t StringUtil::EncodeAndAppendUTF8(void* utf8, size_t pos, size_t size, char32_t ch)
{
u8* utf8_bytes = static_cast<u8*>(utf8) + pos;
if (ch <= 0x7F) [[likely]]
{
if (pos == size) [[unlikely]]
return 0;
utf8_bytes[0] = static_cast<u8>(ch);
return 1;
}
else if (ch <= 0x07FF)
{
if ((pos + 1) >= size) [[unlikely]]
return 0;
utf8_bytes[0] = static_cast<u8>(0xc0 | static_cast<u8>((ch >> 6) & 0x1f));
utf8_bytes[1] = static_cast<u8>(0x80 | static_cast<u8>((ch & 0x3f)));
return 2;
}
else if (ch <= 0xFFFF)
{
if ((pos + 3) >= size) [[unlikely]]
return 0;
utf8_bytes[0] = static_cast<u8>(0xe0 | static_cast<u8>(((ch >> 12) & 0x0f)));
utf8_bytes[1] = static_cast<u8>(0x80 | static_cast<u8>(((ch >> 6) & 0x3f)));
utf8_bytes[2] = static_cast<u8>(0x80 | static_cast<u8>((ch & 0x3f)));
return 3;
}
else if (ch <= 0x10FFFF)
{
if ((pos + 4) >= size) [[unlikely]]
return 0;
utf8_bytes[0] = static_cast<u8>(0xf0 | static_cast<u8>(((ch >> 18) & 0x07)));
utf8_bytes[1] = static_cast<u8>(0x80 | static_cast<u8>(((ch >> 12) & 0x3f)));
utf8_bytes[2] = static_cast<u8>(0x80 | static_cast<u8>(((ch >> 6) & 0x3f)));
utf8_bytes[3] = static_cast<u8>(0x80 | static_cast<u8>((ch & 0x3f)));
return 4;
}
else
{
if ((pos + 3) >= size) [[unlikely]]
return 0;
utf8_bytes[0] = 0xefu;
utf8_bytes[1] = 0xbfu;
utf8_bytes[2] = 0xbdu;
return 3;
}
}
size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch)
{
const u8* s = reinterpret_cast<const u8*>(bytes);
if (s[0] < 0x80)
if (s[0] < 0x80) [[likely]]
{
*ch = s[0];
return 1;
}
else if ((s[0] & 0xe0) == 0xc0)
{
if (length < 2)
if (length < 2) [[unlikely]]
goto invalid;
*ch = static_cast<char32_t>((static_cast<u32>(s[0] & 0x1f) << 6) | (static_cast<u32>(s[1] & 0x3f) << 0));
@ -490,7 +558,7 @@ size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch)
}
else if ((s[0] & 0xf0) == 0xe0)
{
if (length < 3)
if (length < 3) [[unlikely]]
goto invalid;
*ch = static_cast<char32_t>((static_cast<u32>(s[0] & 0x0f) << 12) | (static_cast<u32>(s[1] & 0x3f) << 6) |
@ -499,7 +567,7 @@ size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch)
}
else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4))
{
if (length < 4)
if (length < 4) [[unlikely]]
goto invalid;
*ch = static_cast<char32_t>((static_cast<u32>(s[0] & 0x07) << 18) | (static_cast<u32>(s[1] & 0x3f) << 12) |
@ -512,6 +580,82 @@ invalid:
return 1;
}
size_t StringUtil::EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint)
{
u8* const utf16_bytes = std::assume_aligned<sizeof(u16)>(static_cast<u8*>(utf16)) + (pos * sizeof(u16));
if (codepoint <= 0xFFFF) [[likely]]
{
if (pos == size) [[unlikely]]
return 0;
// surrogates are invalid
const u16 codepoint16 =
static_cast<u16>((codepoint >= 0xD800 && codepoint <= 0xDFFF) ? UNICODE_REPLACEMENT_CHARACTER : codepoint);
std::memcpy(utf16_bytes, &codepoint16, sizeof(codepoint16));
return 1;
}
else if (codepoint <= 0x10FFFF)
{
if ((pos + 1) >= size) [[unlikely]]
return 0;
codepoint -= 0x010000;
const u16 low = static_cast<u16>(((static_cast<u32>(codepoint) >> 10) & 0x3FFu) + 0xD800);
const u16 high = static_cast<u16>((static_cast<u32>(codepoint) & 0x3FFu) + 0xDC00);
std::memcpy(utf16_bytes, &low, sizeof(high));
std::memcpy(utf16_bytes + sizeof(u16), &high, sizeof(high));
return 2;
}
else
{
// unrepresentable
constexpr u16 value = static_cast<u16>(UNICODE_REPLACEMENT_CHARACTER);
std::memcpy(utf16_bytes, &value, sizeof(value));
return 1;
}
}
/// Decodes one codepoint from a UTF-16 (native byte order) buffer.
///
/// The caller must ensure pos < length; the code unit at pos is always read.
///
/// @param bytes  Start of the source buffer; treated as aligned for 16-bit access.
/// @param pos    Index, in 16-bit code units, of the unit to start decoding at.
/// @param length Total size of the buffer, in 16-bit code units.
/// @param ch     Receives the decoded codepoint, or UNICODE_REPLACEMENT_CHARACTER when a
///               high surrogate is not followed by a low surrogate.
/// @return Number of 16-bit code units consumed (1 or 2).
size_t StringUtil::DecodeUTF16(const void* bytes, size_t pos, size_t length, char32_t* ch)
{
  const u8* const utf16_bytes = std::assume_aligned<sizeof(u16)>(static_cast<const u8*>(bytes)) + pos * sizeof(u16);

  u16 high;
  std::memcpy(&high, utf16_bytes, sizeof(high));

  // High surrogate?
  if (high >= 0xD800 && high <= 0xDBFF) [[unlikely]]
  {
    // The second unit may only be read if it lies inside the buffer. The check must
    // account for pos, otherwise a high surrogate in the last unit reads past the end.
    if (pos >= length || (length - pos) < 2) [[unlikely]]
    {
      // Missing low surrogate.
      *ch = UNICODE_REPLACEMENT_CHARACTER;
      return 1;
    }

    u16 low;
    std::memcpy(&low, utf16_bytes + sizeof(u16), sizeof(low));

    if (low >= 0xDC00 && low <= 0xDFFF) [[likely]]
    {
      *ch = static_cast<char32_t>(((static_cast<u32>(high) - 0xD800u) << 10) + ((static_cast<u32>(low) - 0xDC00)) +
                                  0x10000u);
      return 2;
    }
    else
    {
      // High surrogate followed by something that is not a low surrogate.
      // NOTE(review): both units are consumed, so the following unit is dropped even if
      // it is a valid character on its own - confirm this is the intended recovery.
      *ch = UNICODE_REPLACEMENT_CHARACTER;
      return 2;
    }
  }
  else
  {
    // Single 16-bit value. An unpaired low surrogate is passed through unchanged.
    *ch = static_cast<char32_t>(high);
    return 1;
  }
}
std::string StringUtil::Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis /*= "..."*/)
{
std::string ret;

View File

@ -361,13 +361,22 @@ static constexpr char32_t UNICODE_REPLACEMENT_CHARACTER = 0xFFFD;
/// Appends a UTF-16/UTF-32 codepoint to a UTF-8 string.
void EncodeAndAppendUTF8(std::string& s, char32_t ch);
/// Encodes a codepoint as UTF-8 into a byte buffer of `size` bytes, starting at byte offset `pos`.
/// Returns the number of bytes written, or 0 if the encoded sequence does not fit.
size_t EncodeAndAppendUTF8(void* utf8, size_t pos, size_t size, char32_t ch);
/// Returns the number of bytes (1-4) a codepoint occupies when encoded as UTF-8.
/// Values above U+10FFFF are reported as 3 bytes.
size_t GetEncodedUTF8Length(char32_t ch);
/// Decodes UTF-8 to a single codepoint, updating the position parameter.
/// Decodes UTF-8 to a single unicode codepoint.
/// Returns the number of bytes the codepoint took in the original string.
size_t DecodeUTF8(const void* bytes, size_t length, char32_t* ch);
size_t DecodeUTF8(const std::string_view str, size_t offset, char32_t* ch);
size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch);
/// Appends a unicode codepoint to a UTF-16 string.
/// `pos` and `size` are measured in 16-bit code units, not bytes.
/// Returns the number of 16-bit code units written (1 or 2), or 0 if there is not enough room.
size_t EncodeAndAppendUTF16(void* utf16, size_t pos, size_t size, char32_t codepoint);
/// Decodes UTF-16 to a single unicode codepoint.
/// `pos` is measured in 16-bit code units, not bytes.
/// Returns the number of 16-bit code units (1 or 2) the codepoint took in the original string.
size_t DecodeUTF16(const void* bytes, size_t pos, size_t size, char32_t* codepoint);
// Replaces the end of a string with ellipsis if it exceeds the specified length.
std::string Ellipsise(const std::string_view str, u32 max_length, const char* ellipsis = "...");
void EllipsiseInPlace(std::string& str, u32 max_length, const char* ellipsis = "...");