From 89659db7ee0b3869037757b8dca7855735d4b77e Mon Sep 17 00:00:00 2001 From: Connor McLaughlin Date: Sat, 30 Jul 2022 21:46:14 +1000 Subject: [PATCH] Path: Unicode handling and tests for SanitizeFileName --- src/common-tests/path_tests.cpp | 16 ++++++ src/common/event.cpp | 2 +- src/common/file_system.cpp | 68 ++++++++++++------------- src/common/path.h | 3 +- src/common/string_util.cpp | 75 +++++++++++++++++++++++++--- src/common/string_util.h | 8 ++- src/core/system.cpp | 4 +- src/frontend-common/achievements.cpp | 3 +- src/frontend-common/game_list.cpp | 3 +- 9 files changed, 131 insertions(+), 51 deletions(-) diff --git a/src/common-tests/path_tests.cpp b/src/common-tests/path_tests.cpp index 0f68bb4f6..d83addb1d 100644 --- a/src/common-tests/path_tests.cpp +++ b/src/common-tests/path_tests.cpp @@ -222,4 +222,20 @@ TEST(FileSystem, ChangeFileName) #else ASSERT_EQ(Path::ChangeFileName("/foo/bar", "baz"), "/foo/baz"); #endif +} + +TEST(FileSystem, SanitizeFileName) +{ + ASSERT_EQ(Path::SanitizeFileName(u8"foo"), u8"foo"); + ASSERT_EQ(Path::SanitizeFileName(u8"foo/bar"), u8"foo_bar"); + ASSERT_EQ(Path::SanitizeFileName(u8"f🙃o"), u8"f🙃o"); + ASSERT_EQ(Path::SanitizeFileName(u8"ŻąłóРстуぬねのはen🍪⟑η∏☉ⴤℹ︎∩₲ ₱⟑♰⫳🐱"), u8"ŻąłóРстуぬねのはen🍪⟑η∏☉ⴤℹ︎∩₲ ₱⟑♰⫳🐱"); +#ifdef _WIN32 + ASSERT_EQ(Path::SanitizeFileName(u8"foo:"), u8"foo_"); + ASSERT_EQ(Path::SanitizeFileName(u8"foo:bar."), u8"foo_bar_"); + ASSERT_EQ(Path::SanitizeFileName(u8"foo\\bar"), u8"foo_bar"); + ASSERT_EQ(Path::SanitizeFileName(u8"foo>bar"), u8"foo_bar"); + ASSERT_EQ(Path::SanitizeFileName(u8"foo\\bar", false), u8"foo\\bar"); +#endif + ASSERT_EQ(Path::SanitizeFileName(u8"foo/bar", false), u8"foo/bar"); } \ No newline at end of file diff --git a/src/common/event.cpp b/src/common/event.cpp index 126d9372d..9be68907e 100644 --- a/src/common/event.cpp +++ b/src/common/event.cpp @@ -98,7 +98,7 @@ bool Event::TryWait(u32 timeout_in_ms) EnterCriticalSection(&m_cs); while (!m_signaled.load() && (GetTickCount() - start) < timeout_in_ms) - SleepConditionVariableCS(&m_cv, &m_cs, INFINITE); + SleepConditionVariableCS(&m_cv, &m_cs, timeout_in_ms); const bool result = m_signaled.load(); diff --git a/src/common/file_system.cpp b/src/common/file_system.cpp index 5f16e376e..484f77189 100644 --- a/src/common/file_system.cpp +++ b/src/common/file_system.cpp @@ -60,16 +60,28 @@ static std::time_t ConvertFileTimeToUnixTime(const FILETIME& ft) } #endif -static inline bool FileSystemCharacterIsSane(char c, bool StripSlashes) +static inline bool FileSystemCharacterIsSane(char32_t c, bool strip_slashes) { - if (!(c >= 'a' && c <= 'z') && !(c >= 'A' && c <= 'Z') && !(c >= '0' && c <= '9') && c != ' ' && c != ' ' && - c != '_' && c != '-' && c != '.') - { - if (!StripSlashes && (c == '/' || c == '\\')) - return true; +#ifdef _WIN32 + // https://docs.microsoft.com/en-gb/windows/win32/fileio/naming-a-file?redirectedfrom=MSDN#naming-conventions + if ((c == U'/' || c == U'\\') && strip_slashes) + return false; + if (c == U'<' || c == U'>' || c == U':' || c == U'"' || c == U'|' || c == U'?' || c == U'*' || c == U'0' || + c <= static_cast(31)) + { return false; } +#else + if (c == '/' && strip_slashes) + return false; + + // macos doesn't allow colons, apparently +#ifdef __APPLE__ + if (c == U':') + return false; +#endif +#endif return true; } @@ -117,39 +129,27 @@ static inline void PathAppendString(std::string& dst, const T& src) } } -void Path::SanitizeFileName(char* Destination, u32 cbDestination, const char* FileName, bool StripSlashes /* = true */) +std::string Path::SanitizeFileName(const std::string_view& str, bool strip_slashes /* = true */) { - u32 i; - u32 fileNameLength = static_cast(std::strlen(FileName)); + std::string ret; + ret.reserve(str.length()); - if (FileName == Destination) + size_t pos = 0; + while (pos < str.length()) { - for (i = 0; i < fileNameLength; i++) - { - if (!FileSystemCharacterIsSane(FileName[i], StripSlashes)) - Destination[i] = '_'; - } + char32_t ch; + pos += StringUtil::DecodeUTF8(str, pos, &ch); + ch = FileSystemCharacterIsSane(ch, strip_slashes) ? ch : U'_'; + StringUtil::EncodeAndAppendUTF8(ret, ch); } - else - { - for (i = 0; i < fileNameLength && i < cbDestination; i++) - { - if (FileSystemCharacterIsSane(FileName[i], StripSlashes)) - Destination[i] = FileName[i]; - else - Destination[i] = '_'; - } - } -} -void Path::SanitizeFileName(std::string& Destination, bool StripSlashes /* = true*/) -{ - const std::size_t len = Destination.length(); - for (std::size_t i = 0; i < len; i++) - { - if (!FileSystemCharacterIsSane(Destination[i], StripSlashes)) - Destination[i] = '_'; - } +#ifdef _WIN32 + // Windows: Can't end filename with a period. + if (ret.length() > 0 && ret.back() == '.') + ret.back() = '_'; +#endif + + return ret; } bool Path::IsAbsolute(const std::string_view& path) diff --git a/src/common/path.h b/src/common/path.h index 75c99cea9..5ece83136 100644 --- a/src/common/path.h +++ b/src/common/path.h @@ -22,8 +22,7 @@ std::string Canonicalize(const std::string_view& path); void Canonicalize(std::string* path); /// Sanitizes a filename for use in a filesystem. -void SanitizeFileName(char* Destination, u32 cbDestination, const char* FileName, bool StripSlashes /* = true */); -void SanitizeFileName(std::string& Destination, bool StripSlashes = true); +std::string SanitizeFileName(const std::string_view& str, bool strip_slashes = true); /// Returns true if the specified path is an absolute path (C:\Path on Windows or /path on Unix). bool IsAbsolute(const std::string_view& path); diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp index 88e636805..b43aabe3d 100644 --- a/src/common/string_util.cpp +++ b/src/common/string_util.cpp @@ -295,25 +295,88 @@ bool StringUtil::ParseAssignmentString(const std::string_view& str, std::string_ return true; } -void StringUtil::AppendUTF16CharacterToUTF8(std::string& s, u16 ch) +void StringUtil::EncodeAndAppendUTF8(std::string& s, char32_t ch) { - if (ch & 0xf800) + if (ch <= 0x7F) { - s.push_back(static_cast(static_cast(0xe0 | static_cast(ch >> 12)))); + s.push_back(static_cast(static_cast(ch))); + } + else if (ch <= 0x07FF) + { + s.push_back(static_cast(static_cast(0xc0 | static_cast((ch >> 6) & 0x1f)))); + s.push_back(static_cast(static_cast(0x80 | static_cast((ch & 0x3f))))); + } + else if (ch <= 0xFFFF) + { + s.push_back(static_cast(static_cast(0xe0 | static_cast(((ch >> 12) & 0x0f))))); s.push_back(static_cast(static_cast(0x80 | static_cast(((ch >> 6) & 0x3f))))); s.push_back(static_cast(static_cast(0x80 | static_cast((ch & 0x3f))))); } - else if (ch & 0xff80) + else if (ch <= 0x10FFFF) { - s.push_back(static_cast(static_cast(0xc0 | static_cast((ch >> 6))))); + s.push_back(static_cast(static_cast(0xf0 | static_cast(((ch >> 18) & 0x07))))); + s.push_back(static_cast(static_cast(0x80 | static_cast(((ch >> 12) & 0x3f))))); + s.push_back(static_cast(static_cast(0x80 | static_cast(((ch >> 6) & 0x3f))))); s.push_back(static_cast(static_cast(0x80 | static_cast((ch & 0x3f))))); } else { - s.push_back(static_cast(static_cast(ch))); + s.push_back(static_cast(0xefu)); + s.push_back(static_cast(0xbfu)); + s.push_back(static_cast(0xbdu)); } } +size_t StringUtil::DecodeUTF8(const void* bytes, size_t length, char32_t* ch) +{ + const u8* s = reinterpret_cast(bytes); + if (s[0] < 0x80) + { + *ch = s[0]; + return 1; + } + else if ((s[0] & 0xe0) == 0xc0) + { + if (length < 2) + goto invalid; + + *ch = static_cast((static_cast(s[0] & 0x1f) << 6) | (static_cast(s[1] & 0x3f) << 0)); + return 2; + } + else if ((s[0] & 0xf0) == 0xe0) + { + if (length < 3) + goto invalid; + + *ch = static_cast((static_cast(s[0] & 0x0f) << 12) | (static_cast(s[1] & 0x3f) << 6) | + (static_cast(s[2] & 0x3f) << 0)); + return 3; + } + else if ((s[0] & 0xf8) == 0xf0 && (s[0] <= 0xf4)) + { + if (length < 4) + goto invalid; + + *ch = static_cast((static_cast(s[0] & 0x07) << 18) | (static_cast(s[1] & 0x3f) << 12) | + (static_cast(s[2] & 0x3f) << 6) | (static_cast(s[3] & 0x3f) << 0)); + return 4; + } + +invalid: + *ch = 0xFFFFFFFFu; + return 1; +} + +size_t StringUtil::DecodeUTF8(const std::string_view& str, size_t offset, char32_t* ch) +{ + return DecodeUTF8(str.data() + offset, str.length() - offset, ch); +} + +size_t StringUtil::DecodeUTF8(const std::string& str, size_t offset, char32_t* ch) +{ + return DecodeUTF8(str.data() + offset, str.length() - offset, ch); +} + #ifdef _WIN32 std::wstring StringUtil::UTF8StringToWideString(const std::string_view& str) diff --git a/src/common/string_util.h b/src/common/string_util.h index 0ad086778..f2e6741c5 100644 --- a/src/common/string_util.h +++ b/src/common/string_util.h @@ -185,7 +185,13 @@ void ReplaceAll(std::string* subject, const std::string_view& search, const std: bool ParseAssignmentString(const std::string_view& str, std::string_view* key, std::string_view* value); /// Appends a UTF-16/UTF-32 codepoint to a UTF-8 string. -void AppendUTF16CharacterToUTF8(std::string& s, u16 ch); +void EncodeAndAppendUTF8(std::string& s, char32_t ch); + +/// Decodes UTF-8 to a single codepoint, updating the position parameter. +/// Returns the number of bytes the codepoint took in the original string. +size_t DecodeUTF8(const void* bytes, size_t length, char32_t* ch); +size_t DecodeUTF8(const std::string_view& str, size_t offset, char32_t* ch); +size_t DecodeUTF8(const std::string& str, size_t offset, char32_t* ch); /// Strided memcpy/memcmp. ALWAYS_INLINE static void StrideMemCpy(void* dst, std::size_t dst_stride, const void* src, std::size_t src_stride, diff --git a/src/core/system.cpp b/src/core/system.cpp index 3149460a2..3304de665 100644 --- a/src/core/system.cpp +++ b/src/core/system.cpp @@ -703,9 +703,7 @@ std::optional System::GetRegionForPath(const char* image_path) std::string System::GetGameSettingsPath(const std::string_view& game_serial) { - std::string sanitized_serial(game_serial); - Path::SanitizeFileName(sanitized_serial); - + const std::string sanitized_serial(Path::SanitizeFileName(game_serial)); return Path::Combine(EmuFolders::GameSettings, fmt::format("{}.ini", sanitized_serial)); } diff --git a/src/frontend-common/achievements.cpp b/src/frontend-common/achievements.cpp index 1619d650d..dbd1eeef9 100644 --- a/src/frontend-common/achievements.cpp +++ b/src/frontend-common/achievements.cpp @@ -1747,8 +1747,7 @@ const std::string& Achievements::GetAchievementBadgePath(const Achievement& achi return badge_path; // well, this comes from the internet.... :) - std::string clean_name(achievement.badge_name); - Path::SanitizeFileName(clean_name); + const std::string clean_name(Path::SanitizeFileName(achievement.badge_name)); badge_path = Path::Combine(s_achievement_icon_cache_directory, fmt::format("{}{}.png", clean_name, achievement.locked ? "_lock" : "")); if (FileSystem::FileExists(badge_path.c_str())) diff --git a/src/frontend-common/game_list.cpp b/src/frontend-common/game_list.cpp index 02fdaeaf4..a0c40456a 100644 --- a/src/frontend-common/game_list.cpp +++ b/src/frontend-common/game_list.cpp @@ -660,8 +660,7 @@ std::string GameList::GetNewCoverImagePathForEntry(const Entry* entry, const cha } // Check for illegal characters, use serial instead. - std::string sanitized_name(entry->title); - Path::SanitizeFileName(sanitized_name); + const std::string sanitized_name(Path::SanitizeFileName(entry->title)); std::string name; if (sanitized_name != entry->title)