overlays/util: Add support for glyph set lowering when mapping utf8 to ascii8

- Lower fullwidth glyphs to halfwidth counterparts - Lower CJK punctuation glyphs - Lower general punctuation glyphs
2019-02-03 14:32:40 +03:00 · 2019-02-03 14:32:40 +03:00 · 67cdec577f
parent a36d3af3b4
commit 67cdec577f
1 changed files with 93 additions and 6 deletions
--- a/rpcs3/Emu/RSX/Overlays/overlays.cpp
+++ b/rpcs3/Emu/RSX/Overlays/overlays.cpp
@ -2,12 +2,67 @@
 #include "overlays.h"
 #include "../GSRender.h"

+static auto s_ascii_lowering_map = []()
+{
+	std::unordered_map<u32, u8> _map;
+
+	// Fullwidth block (FF00-FF5E)
+	for (u32 u = 0xFF01, c = 0x21; u <= 0xFF5E; ++u, ++c)
+	{
+		_map[u] = u8(c);
+	}
+
+	// Em and En space variations (General Punctuation)
+	for (u32 u = 0x2000; u <= 0x200A; ++u)
+	{
+		_map[u] = u8(' ');
+	}
+
+	// Misc space variations
+	_map[0x202F] = u8(0xA0); // narrow NBSP
+	_map[0x205F] = u8(' ');  // medium mathematical space
+	_map[0x3164] = u8(' ');  // hangul filler
+
+	// Ideographic (CJK punctuation)
+	_map[0x3000] = u8(' ');  // space
+	_map[0x3001] = u8(',');  // comma
+	_map[0x3002] = u8('.');  // fullstop
+	_map[0x3003] = u8('"');  // ditto
+	_map[0x3007] = u8('0');  // wide zero
+	_map[0x3008] = u8('<');  // left angle brace
+	_map[0x3009] = u8('>');  // right angle brace
+	_map[0x300A] = u8(0xAB); // double left angle brace
+	_map[0x300B] = u8(0xBB); // double right angle brace
+	_map[0x300C] = u8('[');  // the following are all slight variations on the angular brace
+	_map[0x300D] = u8(']');
+	_map[0x300E] = u8('[');
+	_map[0x300F] = u8(']');
+	_map[0x3010] = u8('[');
+	_map[0x3011] = u8(']');
+	_map[0x3014] = u8('[');
+	_map[0x3015] = u8(']');
+	_map[0x3016] = u8('[');
+	_map[0x3017] = u8(']');
+	_map[0x3018] = u8('[');
+	_map[0x3019] = u8(']');
+	_map[0x301A] = u8('[');
+	_map[0x301B] = u8(']');
+	_map[0x301C] = u8('~');  // wave dash (inverted tilde)
+	_map[0x301D] = u8('"');  // reverse double prime quotation
+	_map[0x301E] = u8('"');  // double prime quotation
+	_map[0x301F] = u8('"');  // low double prime quotation
+	_map[0x3031] = u8('<');  // vertical kana repeat mark
+
+	return _map;
+}();
+
 std::string utf8_to_ascii8(const std::string& utf8_string)
 {
 	std::vector<u8> out;
 	out.reserve(utf8_string.length() + 1);

-	for (u32 index = 0; index < utf8_string.length(); ++index)
+	const auto end = utf8_string.length();
+	for (u32 index = 0; index < end; ++index)
 	{
 		const auto code = (u8)utf8_string[index];
 		if (code <= 0x7F)
@ -16,18 +71,50 @@ std::string utf8_to_ascii8(const std::string& utf8_string)
 			continue;
 		}

-		auto extra_bytes = (code <= 0xDF) ? 1u : (code <= 0xEF) ? 2u : 3u;
+		const auto extra_bytes = (code <= 0xDF) ? 1u : (code <= 0xEF) ? 2u : 3u;
+		if ((index + extra_bytes) > end)
+		{
+			// Malformed string, abort
+			LOG_ERROR(GENERAL, "Failed to decode supossedly malformed utf8 string '%s'", utf8_string);
+			break;
+		}
+
+		u32 u_code = 0;
+		switch (extra_bytes)
+		{
+		case 1:
+			// 11 bits, 6 + 5
+			u_code = (u32(code & 0x1F) << 6) | u32(utf8_string[index + 1] & 0x3F);
+			break;
+		case 2:
+			// 16 bits, 6 + 6 + 4
+			u_code = (u32(code & 0xF) << 12) | (u32(utf8_string[index + 1] & 0x3F) << 6) | u32(utf8_string[index + 2] & 0x3F);
+			break;
+		case 3:
+			// 21 bits, 6 + 6 + 6 + 3
+			u_code = (u32(code & 0x7) << 18) | (u32(utf8_string[index + 1] & 0x3F) << 12) | (u32(utf8_string[index + 2] & 0x3F) << 6) | u32(utf8_string[index + 3] & 0x3F);
+			break;
+		default:
+			fmt::throw_exception("Unreachable" HERE);
+		}
+
 		index += extra_bytes;

-		if (extra_bytes > 1 || (code & 0x1C))
+		if (u_code <= 0xFF)
+		{
+			// Latin-1 supplement block
+			out.push_back(u8(u_code));
+			continue;
+		}
+
+		auto replace = s_ascii_lowering_map.find(u_code);
+		if (replace == s_ascii_lowering_map.end())
 		{
-			// Needs more bits than we could represent with extended ASCII anyway
 			out.push_back('#');
 			continue;
 		}

-		u8 out_code = ((code & 0x3) << 6) | (u8(utf8_string[index]) & 0x3F);
-		out.push_back(out_code);
+		out.push_back(replace->second);
 	}

 	out.push_back(0);