diff --git a/Source/Core/Common/StringUtil.cpp b/Source/Core/Common/StringUtil.cpp
index 79dc2a4269..8229b5052d 100644
--- a/Source/Core/Common/StringUtil.cpp
+++ b/Source/Core/Common/StringUtil.cpp
@@ -5,7 +5,7 @@

 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -391,6 +390,245 @@ size_t StringUTF8CodePointCount(std::string_view str)
   return str.size() - std::ranges::count_if(str, [](char c) -> bool { return (c & 0xC0) == 0x80; });
 }

+constexpr char32_t UNICODE_REPLACEMENT_CHARACTER = 0xfffd;  // Substituted for any invalid input.
+constexpr char32_t UNICODE_LAST_CODE_POINT = 0x10ffff;  // Largest value the coders below accept.
+
+constexpr u16 UNICODE_HIGH_SURROGATE = 0xd800;  // First value of the high (leading) surrogates.
+constexpr u16 UNICODE_LOW_SURROGATE = 0xdc00;  // First value of the low (trailing) surrogates.
+
+constexpr u16 SURROGATE_VALUE_MASK = 0x3ffu;  // Low 10 bits: the payload carried by a surrogate.
+
+static constexpr bool IsSurrogateCodePoint(char32_t code_point)
+{
+  // NOTE(review): only bits 11-15 are tested, so a value above 0xffff such as 0x1d800 would also
+  // match. The callers visible in this patch pass at most 16-bit values.
+  return (code_point & 0xf800u) == UNICODE_HIGH_SURROGATE;
+}
+
+template  // NOTE(review): angle-bracket lists appear to be missing throughout this copy.
+requires(sizeof(CharType) == 1)
+class UTF8Decoder  // Reads one code point per call from a range of 1-byte code units.
+{
+public:
+  constexpr explicit UTF8Decoder(std::span chars)
+      : m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
+  {
+  }
+
+  auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }  // NOTE(review): not constexpr.
+
+  constexpr char32_t operator()()  // Decodes the next code point; invalid input gives U+FFFD.
+  {
+    assert(RemainingCodeUnits() > 0);  // The caller must check for remaining input first.
+
+    const u8 first_code_unit = *m_ptr;
+    ++m_ptr;
+
+    switch (std::countl_one(first_code_unit))  // Leading one bits select the sequence length.
+    {
+    case 0:  // ASCII: the code unit is the code point.
+      return first_code_unit;
+    case 2:
+      return FinishReadingSequence<2, 0x80>(first_code_unit);
+    case 3:
+    {
+      const u32 code_point = FinishReadingSequence<3, 0x800>(first_code_unit);
+      if (!IsSurrogateCodePoint(code_point))
+        return code_point;
+      break;  // Encoded surrogate: replaced below.
+    }
+    case 4:
+    {
+      const u32 code_point = FinishReadingSequence<4, 0x10000>(first_code_unit);
+      if (code_point <= UNICODE_LAST_CODE_POINT)
+        return code_point;
+      break;  // Above U+10FFFF: replaced below.
+    }
+    default:  // Stray continuation byte (one leading 1 bit) or five or more leading 1 bits.
+      break;
+    }
+
+    return UNICODE_REPLACEMENT_CHARACTER;
+  }
+
+private:
+  template
+  constexpr u32 FinishReadingSequence(u8 first_code_unit)
+  {
+    // Remove the leading one bits (and the zero bit that terminates them).
+    u32 code_point = first_code_unit & (0x7fu >> ByteCount);
+
+    for (u32 byte_count = ByteCount - 1; byte_count != 0; --byte_count)
+    {
+      if (RemainingCodeUnits() == 0)  // Sequence truncated by the end of the input.
+        return UNICODE_REPLACEMENT_CHARACTER;
+
+      const auto code_unit = u8(*m_ptr);
+
+      if (!IsContinuationByte(code_unit))  // Left unconsumed so the next call decodes it.
+        return UNICODE_REPLACEMENT_CHARACTER;
+
+      ++m_ptr;
+
+      code_point = (code_point << 6u) | (code_unit & 0x3fu);  // Append six payload bits.
+    }
+
+    // Overlong encoding: the value is below the minimum for a sequence of this length.
+    if (code_point < FirstValidCodePoint)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    return code_point;
+  }
+
+  static constexpr bool IsContinuationByte(u8 code_unit) { return std::countl_one(code_unit) == 1; }
+
+  const CharType* m_ptr;  // Next code unit to read.
+  const CharType* const m_end_ptr;  // One past the last code unit.
+};
+
+class UTF8Encoder  // Writes one code point as 1 to 4 one-byte code units.
+{
+public:
+  static constexpr u32 GetMaxUnitsPerCodePoint() { return 4; }
+
+  // `ptr` should point to at least 4 bytes.
+  // Returns the number of written code units (bytes).
+  template
+  requires(sizeof(CharType) == 1)
+  constexpr u32 operator()(char32_t code_point, CharType* ptr)
+  {
+    // ASCII: written as a single byte.
+    if (code_point < 0x80u)
+    {
+      *ptr = u8(code_point);
+      return 1;
+    }
+
+    if (code_point < 0x800u)
+      return WriteSequence<2>(code_point, ptr);
+
+    if (code_point < 0x10000u)  // NOTE(review): surrogate values are not rejected here.
+      return WriteSequence<3>(code_point, ptr);
+
+    if (code_point <= UNICODE_LAST_CODE_POINT)
+      return WriteSequence<4>(code_point, ptr);
+
+    return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);  // Out of range: write U+FFFD instead.
+  }
+
+private:
+  template
+  static constexpr u32 WriteSequence(u32 code_point, auto* ptr)
+  {
+    *ptr = u8((0xf0u << (4 - ByteCount)) | (code_point >> (6 * (ByteCount - 1))));  // Lead byte.
+
+    for (u32 i = ByteCount - 1; i != 0; --i)  // Continuation bytes, filled from last to first.
+    {
+      ptr[i] = u8(0x80u | (code_point & 0x3fu));
+      code_point >>= 6;
+    }
+
+    return ByteCount;
+  }
+};
+
+template
+requires(sizeof(CharType) == 2)
+class UTF16Decoder  // Reads one code point per call from a range of 2-byte code units.
+{
+public:
+  constexpr explicit UTF16Decoder(std::span chars)
+      : m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
+  {
+  }
+
+  auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }  // NOTE(review): not constexpr.
+
+  constexpr char32_t operator()()  // Decodes the next code point; invalid input gives U+FFFD.
+  {
+    assert(RemainingCodeUnits() > 0);  // The caller must check for remaining input first.
+
+    const u16 first_code_unit = *m_ptr;
+    ++m_ptr;
+
+    // Single code unit: not a surrogate, so it is the code point itself.
+    if (!IsSurrogateCodePoint(first_code_unit))
+      return first_code_unit;
+
+    // Unexpected low surrogate (one that was not preceded by a high surrogate).
+    if (first_code_unit >= UNICODE_LOW_SURROGATE)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    // High surrogate at end of data.
+    if (RemainingCodeUnits() == 0)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    const u16 second_code_unit = *m_ptr;
+
+    // High surrogate not followed by low surrogate. The second unit stays unconsumed.
+    if ((second_code_unit & u16(~SURROGATE_VALUE_MASK)) != UNICODE_LOW_SURROGATE)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    ++m_ptr;
+
+    // We have a surrogate pair: combine both 10-bit payloads and add the 0x10000 offset.
+    return (u32(first_code_unit & SURROGATE_VALUE_MASK) << 10u) +
+           u32(second_code_unit & SURROGATE_VALUE_MASK) + 0x10000u;
+  }
+
+private:
+  const CharType* m_ptr;  // Next code unit to read.
+  const CharType* const m_end_ptr;  // One past the last code unit.
+};
+
+class UTF16Encoder  // Writes one code point as 1 or 2 two-byte code units.
+{
+public:
+  static constexpr u32 GetMaxUnitsPerCodePoint() { return 2; }
+
+  // `ptr` should point to at least 2 code units.
+  // Returns the number of written code units.
+  template
+  requires(sizeof(CharType) == 2)
+  constexpr u32 operator()(char32_t code_point, CharType* ptr)
+  {
+    if (code_point < 0x10000u)  // NOTE(review): surrogate values are written unchanged.
+    {
+      *ptr = u16(code_point);
+      return 1;
+    }
+
+    if (code_point > UNICODE_LAST_CODE_POINT)
+      return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);  // Out of range: write U+FFFD instead.
+
+    // Create surrogate pair: upper 10 bits go to the high surrogate, lower 10 bits to the low one.
+    const u32 value = code_point - 0x10000;
+    ptr[0] = u16(((value >> 10u) & SURROGATE_VALUE_MASK) | UNICODE_HIGH_SURROGATE);
+    ptr[1] = u16((value & SURROGATE_VALUE_MASK) | UNICODE_LOW_SURROGATE);
+    return 2;
+  }
+};
+
+template
+static constexpr std::basic_string
+ReEncodeString(std::basic_string_view input)  // Decodes all of `input` and re-encodes it.
+{
+  Decoder decoder{input};
+  Encoder encoder;
+
+  const auto max_code_units = input.size() * encoder.GetMaxUnitsPerCodePoint();  // Upper bound.
+
+  std::basic_string result;
+  result.resize_and_overwrite(max_code_units, [&](ResultCharType* buf, std::size_t) {
+    auto* position = buf;  // Next output slot.
+
+    while (decoder.RemainingCodeUnits() != 0)
+      position += encoder(decoder(), position);  // The encoder returns how many units it wrote.
+
+    return position - buf;  // Number of code units actually written.
+  });
+
+  return result;
+}
+
 #ifdef _WIN32

 static std::wstring CPToUTF16(u32 code_page, std::string_view input)
@@ -567,14 +805,12 @@ std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)

 std::string UTF16ToUTF8(std::u16string_view input)
 {
-  std::wstring_convert, char16_t> converter;
-  return converter.to_bytes(input.data(), input.data() + input.size());
+  return ReEncodeString, UTF8Encoder, char>(input);  // Malformed input decodes to U+FFFD.
 }

 std::u16string UTF8ToUTF16(std::string_view input)
 {
-  std::wstring_convert, char16_t> converter;
-  return converter.from_bytes(input.data(), input.data() + input.size());
+  return ReEncodeString, UTF16Encoder, char16_t>(input);  // Malformed input decodes to U+FFFD.
 }

 // This is a replacement for path::u8path, which is deprecated starting with C++20.
diff --git a/Source/UnitTests/Common/StringUtilTest.cpp b/Source/UnitTests/Common/StringUtilTest.cpp
index 59ba99a436..38e6c64390 100644
--- a/Source/UnitTests/Common/StringUtilTest.cpp
+++ b/Source/UnitTests/Common/StringUtilTest.cpp
@@ -261,18 +261,52 @@ TEST(StringUtil, CaseInsensitiveContains_OverlappingMatches)

 TEST(StringUtil, CharacterEncodingConversion)
 {
+  const std::string utf8_variety = "🎮 hello ¥ᚼᛒ﹏🐬";  // Same text in both encodings.
+  const std::u16string utf16_variety = u"🎮 hello ¥ᚼᛒ﹏🐬";
+
+  // UTF-16 -> UTF-8
+  const std::string utf8_replacement_char = "\xef\xbf\xbd";  // U+FFFD encoded as UTF-8.
+
+  // Unmatched high surrogate (at the start, and at the very end of the input).
+  EXPECT_EQ(UTF16ToUTF8(u"\xd800" + utf16_variety), utf8_replacement_char + utf8_variety);
+  EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdbff"), utf8_variety + utf8_replacement_char);
+
+  // Unmatched low surrogate.
+  EXPECT_EQ(UTF16ToUTF8(u"\xdc00" + utf16_variety), utf8_replacement_char + utf8_variety);
+  EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdfff"), utf8_variety + utf8_replacement_char);
+
+  // UTF-8 -> UTF-16
+  const std::u16string utf16_replacement_char = u"\xfffd";
+
+  // Unexpected bytes: a lone continuation byte and a byte with five leading one bits.
+  EXPECT_EQ(UTF8ToUTF16("\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xf8" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Overlong encodings (2, 3 and 4 byte forms of values that fit in fewer bytes).
+  EXPECT_EQ(UTF8ToUTF16("\xc0\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xe0\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xf0\x81\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Non-terminated character sequences (the first case is a stray continuation byte).
+  EXPECT_EQ(UTF8ToUTF16("\xa0" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xc0\xf0"), utf16_replacement_char + utf16_replacement_char);
+  EXPECT_EQ(UTF8ToUTF16(utf8_variety + "\xf0\x9f"), utf16_variety + utf16_replacement_char);
+  EXPECT_EQ(UTF8ToUTF16("\xf0\x9fZ"), utf16_replacement_char + u"Z");  // 'Z' is not swallowed.
+
+  // Code point greater than U+10FFFF.
+  EXPECT_EQ(UTF8ToUTF16("\xf7\x80\x80\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Decoded surrogate code points are rejected (these bytes decode to 0xdd81).
+  EXPECT_EQ(UTF8ToUTF16("\xed\xb6\x81" + utf8_variety), utf16_replacement_char + utf16_variety);
+
   // wstring
   EXPECT_EQ(WStringToUTF8(L"hello 🐬"), "hello 🐬");

-  // UTF-16
-  EXPECT_EQ(UTF16ToUTF8(u"hello 🐬"), "hello 🐬");
-  EXPECT_EQ(UTF8ToUTF16("hello 🐬"), u"hello 🐬");
-
   // UTF-16BE
-  char16_t utf16be_str[] = u"hello 🐬";
+  auto utf16be_str = utf16_variety;  // Mutable copy whose code units are byte-swapped below.
   for (auto& c : utf16be_str)
     c = Common::swap16(c);
-  EXPECT_EQ(UTF16BEToUTF8(utf16be_str, 99), "hello 🐬");
+  EXPECT_EQ(UTF16BEToUTF8(utf16be_str.c_str(), 99), utf8_variety);

   // Shift JIS
   EXPECT_EQ(SHIFTJISToUTF8("\x83\x43\x83\x8b\x83\x4a"), "イルカ");