StringUtil: Make UTF16ToUTF8 and UTF8ToUTF16 use custom encoding/decoding implementation to eliminate usage of deprecated std::wstring_convert.

2026-03-21 17:49:58 -05:00 · 2026-01-15 21:52:10 -06:00 · 2026-01-15 21:52:10 -06:00 · f07ac93e55
commit f07ac93e55
parent 6711d77b99
2 changed files with 282 additions and 12 deletions
--- a/Source/Core/Common/StringUtil.cpp
+++ b/Source/Core/Common/StringUtil.cpp
@ -5,7 +5,7 @@

 #include <algorithm>
 #include <array>
-#include <codecvt>
+#include <cassert>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdio>
@ -13,7 +13,6 @@
 #include <cstring>
 #include <iomanip>
 #include <iterator>
-#include <locale>
 #include <sstream>
 #include <string>
 #include <vector>
@ -391,6 +390,245 @@ size_t StringUTF8CodePointCount(std::string_view str)
  return str.size() - std::ranges::count_if(str, [](char c) -> bool { return (c & 0xC0) == 0x80; });
 }

+// U+FFFD, emitted in place of anything that cannot be decoded or encoded.
+constexpr char32_t UNICODE_REPLACEMENT_CHARACTER{0xfffd};
+// Highest code point; larger values are treated as invalid.
+constexpr char32_t UNICODE_LAST_CODE_POINT{0x10ffff};
+
+// First value of the high (leading) surrogate range.
+constexpr u16 UNICODE_HIGH_SURROGATE{0xd800};
+// First value of the low (trailing) surrogate range.
+constexpr u16 UNICODE_LOW_SURROGATE{0xdc00};
+
+// Selects the 10 payload bits carried by each half of a surrogate pair.
+constexpr u16 SURROGATE_VALUE_MASK{0x3ffu};
+
+// Returns true if `code_point` lies in the surrogate range U+D800..U+DFFF.
+// Every bit above the low 11 takes part in the comparison, so supplementary code points that
+// merely share bits 11-15 with the surrogate range (e.g. U+1D800) are not misclassified.
+static constexpr bool IsSurrogateCodePoint(char32_t code_point)
+{
+  return (code_point & ~char32_t{0x7ff}) == UNICODE_HIGH_SURROGATE;
+}
+
+// Incrementally decodes UTF-8 from a span of one-byte code units.
+// Ill-formed input never fails; it is reported as UNICODE_REPLACEMENT_CHARACTER instead.
+template <std::integral CharType>
+requires(sizeof(CharType) == 1)
+class UTF8Decoder
+{
+public:
+  constexpr explicit UTF8Decoder(std::span<const CharType> chars)
+      : m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
+  {
+  }
+
+  // Number of code units that have not been consumed yet.
+  // Marked constexpr because the constexpr decoding functions below call it.
+  constexpr auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }
+
+  // Consumes and returns the next code point. At least one code unit is always consumed.
+  // Must only be called while RemainingCodeUnits() > 0.
+  constexpr char32_t operator()()
+  {
+    assert(RemainingCodeUnits() > 0);
+
+    const u8 first_code_unit = *m_ptr;
+    ++m_ptr;
+
+    // The number of leading one bits in the first byte selects the sequence length.
+    switch (std::countl_one(first_code_unit))
+    {
+    case 0:  // ASCII.
+      return first_code_unit;
+    case 2:
+      return FinishReadingSequence<2, 0x80>(first_code_unit);
+    case 3:
+    {
+      const u32 code_point = FinishReadingSequence<3, 0x800>(first_code_unit);
+      // Reject surrogate code points that were encoded as UTF-8.
+      if (!IsSurrogateCodePoint(code_point))
+        return code_point;
+      break;
+    }
+    case 4:
+    {
+      const u32 code_point = FinishReadingSequence<4, 0x10000>(first_code_unit);
+      // Four bytes can hold values beyond the last code point.
+      if (code_point <= UNICODE_LAST_CODE_POINT)
+        return code_point;
+      break;
+    }
+    default:
+      // 1: unexpected continuation byte. 5 or more: not a usable lead byte.
+      break;
+    }
+
+    return UNICODE_REPLACEMENT_CHARACTER;
+  }
+
+private:
+  // Reads the `ByteCount - 1` continuation bytes that follow `first_code_unit`.
+  // Returns the decoded value, or UNICODE_REPLACEMENT_CHARACTER if the data ends early, a byte
+  // is not a continuation byte, or the value is below `FirstValidCodePoint` (overlong encoding).
+  // A byte that fails the continuation check is left unconsumed for the next call.
+  template <u32 ByteCount, u32 FirstValidCodePoint>
+  constexpr u32 FinishReadingSequence(u8 first_code_unit)
+  {
+    // Remove the leading one bits.
+    u32 code_point = first_code_unit & (0x7fu >> ByteCount);
+
+    for (u32 byte_count = ByteCount - 1; byte_count != 0; --byte_count)
+    {
+      if (RemainingCodeUnits() == 0)
+        return UNICODE_REPLACEMENT_CHARACTER;
+
+      const auto code_unit = u8(*m_ptr);
+
+      if (!IsContinuationByte(code_unit))
+        return UNICODE_REPLACEMENT_CHARACTER;
+
+      ++m_ptr;
+
+      // Each continuation byte contributes its low 6 bits.
+      code_point = (code_point << 6u) | (code_unit & 0x3fu);
+    }
+
+    // Overlong encoding.
+    if (code_point < FirstValidCodePoint)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    return code_point;
+  }
+
+  // Continuation bytes have the bit pattern 10xxxxxx.
+  static constexpr bool IsContinuationByte(u8 code_unit) { return std::countl_one(code_unit) == 1; }
+
+  const CharType* m_ptr;
+  const CharType* const m_end_ptr;
+};
+
+// Encodes single code points as UTF-8.
+// Values that are not encodable (surrogates, or anything above UNICODE_LAST_CODE_POINT) are
+// written as UNICODE_REPLACEMENT_CHARACTER so the output is always well-formed.
+class UTF8Encoder
+{
+public:
+  static constexpr u32 GetMaxUnitsPerCodePoint() { return 4; }
+
+  // `ptr` should point to at least 4 bytes.
+  // Returns the number of written code units (bytes).
+  template <std::integral CharType>
+  requires(sizeof(CharType) == 1)
+  constexpr u32 operator()(char32_t code_point, CharType* ptr)
+  {
+    // ASCII.
+    if (code_point < 0x80u)
+    {
+      *ptr = u8(code_point);
+      return 1;
+    }
+
+    if (code_point < 0x800u)
+      return WriteSequence<2>(code_point, ptr);
+
+    if (code_point < 0x10000u)
+    {
+      // Surrogate code points have no well-formed UTF-8 encoding.
+      if (IsSurrogateCodePoint(code_point))
+        return WriteSequence<3>(UNICODE_REPLACEMENT_CHARACTER, ptr);
+
+      return WriteSequence<3>(code_point, ptr);
+    }
+
+    if (code_point <= UNICODE_LAST_CODE_POINT)
+      return WriteSequence<4>(code_point, ptr);
+
+    return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);
+  }
+
+private:
+  // Writes `code_point` as a `ByteCount`-byte sequence (2 to 4) and returns `ByteCount`.
+  template <u32 ByteCount>
+  static constexpr u32 WriteSequence(u32 code_point, auto* ptr)
+  {
+    // Lead byte: `ByteCount` one bits, a zero bit, then the most significant payload bits.
+    *ptr = u8((0xf0u << (4 - ByteCount)) | (code_point >> (6 * (ByteCount - 1))));
+
+    // Continuation bytes are filled back to front, 6 payload bits each.
+    for (u32 i = ByteCount - 1; i != 0; --i)
+    {
+      ptr[i] = u8(0x80u | (code_point & 0x3fu));
+      code_point >>= 6;
+    }
+
+    return ByteCount;
+  }
+};
+
+// Incrementally decodes UTF-16 from a span of two-byte code units.
+// Unpaired surrogates are reported as UNICODE_REPLACEMENT_CHARACTER.
+template <std::integral CharType>
+requires(sizeof(CharType) == 2)
+class UTF16Decoder
+{
+public:
+  constexpr explicit UTF16Decoder(std::span<const CharType> chars)
+      : m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
+  {
+  }
+
+  // Number of code units that have not been consumed yet.
+  // Marked constexpr because the constexpr operator() below calls it.
+  constexpr auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }
+
+  // Consumes and returns the next code point. At least one code unit is always consumed.
+  // Must only be called while RemainingCodeUnits() > 0.
+  constexpr char32_t operator()()
+  {
+    assert(RemainingCodeUnits() > 0);
+
+    const u16 first_code_unit = *m_ptr;
+    ++m_ptr;
+
+    // Single code unit.
+    if (!IsSurrogateCodePoint(first_code_unit))
+      return first_code_unit;
+
+    // Unexpected low surrogate.
+    if (first_code_unit >= UNICODE_LOW_SURROGATE)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    // High surrogate at end of data.
+    if (RemainingCodeUnits() == 0)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    const u16 second_code_unit = *m_ptr;
+
+    // High surrogate not followed by low surrogate.
+    // The second unit is left unconsumed so the next call decodes it on its own.
+    if ((second_code_unit & u16(~SURROGATE_VALUE_MASK)) != UNICODE_LOW_SURROGATE)
+      return UNICODE_REPLACEMENT_CHARACTER;
+
+    ++m_ptr;
+
+    // We have a surrogate pair: 10 bits from each half, offset by 0x10000.
+    return (u32(first_code_unit & SURROGATE_VALUE_MASK) << 10u) +
+           u32(second_code_unit & SURROGATE_VALUE_MASK) + 0x10000u;
+  }
+
+private:
+  const CharType* m_ptr;
+  const CharType* const m_end_ptr;
+};
+
+// Encodes single code points as UTF-16.
+// Values that are not encodable (surrogates, or anything above UNICODE_LAST_CODE_POINT) are
+// written as UNICODE_REPLACEMENT_CHARACTER so the output is always well-formed.
+class UTF16Encoder
+{
+public:
+  static constexpr u32 GetMaxUnitsPerCodePoint() { return 2; }
+
+  // `ptr` should point to at least 2 code units.
+  // Returns the number of written code units.
+  template <std::integral CharType>
+  requires(sizeof(CharType) == 2)
+  constexpr u32 operator()(char32_t code_point, CharType* ptr)
+  {
+    if (code_point < 0x10000u)
+    {
+      // Writing a surrogate code point verbatim would produce a lone surrogate.
+      *ptr = u16(IsSurrogateCodePoint(code_point) ? UNICODE_REPLACEMENT_CHARACTER : code_point);
+      return 1;
+    }
+
+    if (code_point > UNICODE_LAST_CODE_POINT)
+      return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);
+
+    // Create surrogate pair: the value above 0x10000 is split into two 10-bit halves.
+    const u32 value = code_point - 0x10000;
+    ptr[0] = u16(((value >> 10u) & SURROGATE_VALUE_MASK) | UNICODE_HIGH_SURROGATE);
+    ptr[1] = u16((value & SURROGATE_VALUE_MASK) | UNICODE_LOW_SURROGATE);
+    return 2;
+  }
+};
+
+// Converts `input` from the encoding understood by `Decoder` to the one produced by `Encoder`.
+// The output buffer is sized for the worst case up front and trimmed to the written length.
+template <typename Decoder, typename Encoder, typename ResultCharType, typename InputCharType>
+static constexpr std::basic_string<ResultCharType>
+ReEncodeString(std::basic_string_view<InputCharType> input)
+{
+  Decoder source{input};
+  Encoder sink;
+
+  // Every input code unit yields at most one code point, hence this upper bound.
+  const auto worst_case_size = input.size() * Encoder::GetMaxUnitsPerCodePoint();
+
+  std::basic_string<ResultCharType> output;
+  output.resize_and_overwrite(worst_case_size, [&](ResultCharType* begin, std::size_t) {
+    ResultCharType* cursor = begin;
+
+    while (source.RemainingCodeUnits() != 0)
+    {
+      const char32_t code_point = source();
+      cursor += sink(code_point, cursor);
+    }
+
+    // The returned count becomes the final string length.
+    return cursor - begin;
+  });
+
+  return output;
+}
+
 #ifdef _WIN32

 static std::wstring CPToUTF16(u32 code_page, std::string_view input)
@ -567,14 +805,12 @@ std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)

 std::string UTF16ToUTF8(std::u16string_view input)
 {
-  std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
-  return converter.to_bytes(input.data(), input.data() + input.size());
+  return ReEncodeString<UTF16Decoder<char16_t>, UTF8Encoder, char>(input);  // Unpaired surrogates become U+FFFD.
 }

 std::u16string UTF8ToUTF16(std::string_view input)
 {
-  std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
-  return converter.from_bytes(input.data(), input.data() + input.size());
+  return ReEncodeString<UTF8Decoder<char>, UTF16Encoder, char16_t>(input);  // Ill-formed UTF-8 becomes U+FFFD.
 }

 // This is a replacement for path::u8path, which is deprecated starting with C++20.
--- a/Source/UnitTests/Common/StringUtilTest.cpp
+++ b/Source/UnitTests/Common/StringUtilTest.cpp
@ -261,18 +261,52 @@ TEST(StringUtil, CaseInsensitiveContains_OverlappingMatches)

 TEST(StringUtil, CharacterEncodingConversion)
 {
+  const std::string utf8_variety = "🎮 hello ¥ᚼᛒ﹏🐬";
+  const std::u16string utf16_variety = u"🎮 hello ¥ᚼᛒ﹏🐬";
+
+  // UTF-16 -> UTF-8
+  const std::string utf8_replacement_char = "\xef\xbf\xbd";
+
+  // Unmatched high surrogate.
+  EXPECT_EQ(UTF16ToUTF8(u"\xd800" + utf16_variety), utf8_replacement_char + utf8_variety);
+  EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdbff"), utf8_variety + utf8_replacement_char);
+
+  // Unmatched low surrogate.
+  EXPECT_EQ(UTF16ToUTF8(u"\xdc00" + utf16_variety), utf8_replacement_char + utf8_variety);
+  EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdfff"), utf8_variety + utf8_replacement_char);
+
+  // UTF-8 -> UTF-16
+  const std::u16string utf16_replacement_char = u"\xfffd";
+
+  // Unexpected bytes.
+  EXPECT_EQ(UTF8ToUTF16("\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xf8" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Overlong encodings.
+  EXPECT_EQ(UTF8ToUTF16("\xc0\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xe0\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xf0\x81\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Non-terminated character sequences.
+  EXPECT_EQ(UTF8ToUTF16("\xa0" + utf8_variety), utf16_replacement_char + utf16_variety);
+  EXPECT_EQ(UTF8ToUTF16("\xc0\xf0"), utf16_replacement_char + utf16_replacement_char);
+  EXPECT_EQ(UTF8ToUTF16(utf8_variety + "\xf0\x9f"), utf16_variety + utf16_replacement_char);
+  EXPECT_EQ(UTF8ToUTF16("\xf0\x9fZ"), utf16_replacement_char + u"Z");
+
+  // Code point greater than U+10FFFF.
+  EXPECT_EQ(UTF8ToUTF16("\xf7\x80\x80\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
+
+  // Decoded surrogate code points are rejected.
+  EXPECT_EQ(UTF8ToUTF16("\xed\xb6\x81" + utf8_variety), utf16_replacement_char + utf16_variety);
+
  // wstring
  EXPECT_EQ(WStringToUTF8(L"hello 🐬"), "hello 🐬");

-  // UTF-16
-  EXPECT_EQ(UTF16ToUTF8(u"hello 🐬"), "hello 🐬");
-  EXPECT_EQ(UTF8ToUTF16("hello 🐬"), u"hello 🐬");
-
  // UTF-16BE
-  char16_t utf16be_str[] = u"hello 🐬";
+  auto utf16be_str = utf16_variety;
  for (auto& c : utf16be_str)
    c = Common::swap16(c);
-  EXPECT_EQ(UTF16BEToUTF8(utf16be_str, 99), "hello 🐬");
+  EXPECT_EQ(UTF16BEToUTF8(utf16be_str.c_str(), 99), utf8_variety);

  // Shift JIS
  EXPECT_EQ(SHIFTJISToUTF8("\x83\x43\x83\x8b\x83\x4a"), "イルカ");