mirror of
https://github.com/dolphin-emu/dolphin.git
synced 2026-03-21 17:49:58 -05:00
StringUtil: Make UTF16ToUTF8 and UTF8ToUTF16 use custom encoding/decoding implementation to eliminate usage of deprecated std::wstring_convert.
This commit is contained in:
parent
6711d77b99
commit
f07ac93e55
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <codecvt>
|
||||
#include <cassert>
|
||||
#include <cstdarg>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
|
|
@ -13,7 +13,6 @@
|
|||
#include <cstring>
|
||||
#include <iomanip>
|
||||
#include <iterator>
|
||||
#include <locale>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
|
@ -391,6 +390,245 @@ size_t StringUTF8CodePointCount(std::string_view str)
|
|||
return str.size() - std::ranges::count_if(str, [](char c) -> bool { return (c & 0xC0) == 0x80; });
|
||||
}
|
||||
|
||||
constexpr char32_t UNICODE_REPLACEMENT_CHARACTER = 0xfffd;
|
||||
constexpr char32_t UNICODE_LAST_CODE_POINT = 0x10ffff;
|
||||
|
||||
constexpr u16 UNICODE_HIGH_SURROGATE = 0xd800;
|
||||
constexpr u16 UNICODE_LOW_SURROGATE = 0xdc00;
|
||||
|
||||
constexpr u16 SURROGATE_VALUE_MASK = 0x3ffu;
|
||||
|
||||
static constexpr bool IsSurrogateCodePoint(char32_t code_point)
|
||||
{
|
||||
return (code_point & 0xf800u) == UNICODE_HIGH_SURROGATE;
|
||||
}
|
||||
|
||||
template <std::integral CharType>
|
||||
requires(sizeof(CharType) == 1)
|
||||
class UTF8Decoder
|
||||
{
|
||||
public:
|
||||
constexpr explicit UTF8Decoder(std::span<const CharType> chars)
|
||||
: m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
|
||||
{
|
||||
}
|
||||
|
||||
auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }
|
||||
|
||||
constexpr char32_t operator()()
|
||||
{
|
||||
assert(RemainingCodeUnits() > 0);
|
||||
|
||||
const u8 first_code_unit = *m_ptr;
|
||||
++m_ptr;
|
||||
|
||||
switch (std::countl_one(first_code_unit))
|
||||
{
|
||||
case 0: // ASCII.
|
||||
return first_code_unit;
|
||||
case 2:
|
||||
return FinishReadingSequence<2, 0x80>(first_code_unit);
|
||||
case 3:
|
||||
{
|
||||
const u32 code_point = FinishReadingSequence<3, 0x800>(first_code_unit);
|
||||
if (!IsSurrogateCodePoint(code_point))
|
||||
return code_point;
|
||||
break;
|
||||
}
|
||||
case 4:
|
||||
{
|
||||
const u32 code_point = FinishReadingSequence<4, 0x10000>(first_code_unit);
|
||||
if (code_point <= UNICODE_LAST_CODE_POINT)
|
||||
return code_point;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
private:
|
||||
template <u32 ByteCount, u32 FirstValidCodePoint>
|
||||
constexpr u32 FinishReadingSequence(u8 first_code_unit)
|
||||
{
|
||||
// Remove the leading one bits.
|
||||
u32 code_point = first_code_unit & (0x7fu >> ByteCount);
|
||||
|
||||
for (u32 byte_count = ByteCount - 1; byte_count != 0; --byte_count)
|
||||
{
|
||||
if (RemainingCodeUnits() == 0)
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
const auto code_unit = u8(*m_ptr);
|
||||
|
||||
if (!IsContinuationByte(code_unit))
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
++m_ptr;
|
||||
|
||||
code_point = (code_point << 6u) | (code_unit & 0x3fu);
|
||||
}
|
||||
|
||||
// Overlong encoding.
|
||||
if (code_point < FirstValidCodePoint)
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
return code_point;
|
||||
}
|
||||
|
||||
static constexpr bool IsContinuationByte(u8 code_unit) { return std::countl_one(code_unit) == 1; }
|
||||
|
||||
const CharType* m_ptr;
|
||||
const CharType* const m_end_ptr;
|
||||
};
|
||||
|
||||
class UTF8Encoder
|
||||
{
|
||||
public:
|
||||
static constexpr u32 GetMaxUnitsPerCodePoint() { return 4; }
|
||||
|
||||
// `ptr` should point to at least 4 bytes.
|
||||
// Returns the number of written code units (bytes).
|
||||
template <std::integral CharType>
|
||||
requires(sizeof(CharType) == 1)
|
||||
constexpr u32 operator()(char32_t code_point, CharType* ptr)
|
||||
{
|
||||
// ASCII.
|
||||
if (code_point < 0x80u)
|
||||
{
|
||||
*ptr = u8(code_point);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (code_point < 0x800u)
|
||||
return WriteSequence<2>(code_point, ptr);
|
||||
|
||||
if (code_point < 0x10000u)
|
||||
return WriteSequence<3>(code_point, ptr);
|
||||
|
||||
if (code_point <= UNICODE_LAST_CODE_POINT)
|
||||
return WriteSequence<4>(code_point, ptr);
|
||||
|
||||
return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);
|
||||
}
|
||||
|
||||
private:
|
||||
template <u32 ByteCount>
|
||||
static constexpr u32 WriteSequence(u32 code_point, auto* ptr)
|
||||
{
|
||||
*ptr = u8((0xf0u << (4 - ByteCount)) | (code_point >> (6 * (ByteCount - 1))));
|
||||
|
||||
for (u32 i = ByteCount - 1; i != 0; --i)
|
||||
{
|
||||
ptr[i] = u8(0x80u | (code_point & 0x3fu));
|
||||
code_point >>= 6;
|
||||
}
|
||||
|
||||
return ByteCount;
|
||||
}
|
||||
};
|
||||
|
||||
template <std::integral CharType>
|
||||
requires(sizeof(CharType) == 2)
|
||||
class UTF16Decoder
|
||||
{
|
||||
public:
|
||||
constexpr explicit UTF16Decoder(std::span<const CharType> chars)
|
||||
: m_ptr{chars.data()}, m_end_ptr{m_ptr + chars.size()}
|
||||
{
|
||||
}
|
||||
|
||||
auto RemainingCodeUnits() const { return m_end_ptr - m_ptr; }
|
||||
|
||||
constexpr char32_t operator()()
|
||||
{
|
||||
assert(RemainingCodeUnits() > 0);
|
||||
|
||||
const u16 first_code_unit = *m_ptr;
|
||||
++m_ptr;
|
||||
|
||||
// Single code unit.
|
||||
if (!IsSurrogateCodePoint(first_code_unit))
|
||||
return first_code_unit;
|
||||
|
||||
// Unexpected low surrogate.
|
||||
if (first_code_unit >= UNICODE_LOW_SURROGATE)
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
// High surrogate at end of data.
|
||||
if (RemainingCodeUnits() == 0)
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
const u16 second_code_unit = *m_ptr;
|
||||
|
||||
// High surrogate not followed by low surrogate.
|
||||
if ((second_code_unit & u16(~SURROGATE_VALUE_MASK)) != UNICODE_LOW_SURROGATE)
|
||||
return UNICODE_REPLACEMENT_CHARACTER;
|
||||
|
||||
++m_ptr;
|
||||
|
||||
// We have a surrogate pair.
|
||||
return (u32(first_code_unit & SURROGATE_VALUE_MASK) << 10u) +
|
||||
u32(second_code_unit & SURROGATE_VALUE_MASK) + 0x10000u;
|
||||
}
|
||||
|
||||
private:
|
||||
const CharType* m_ptr;
|
||||
const CharType* const m_end_ptr;
|
||||
};
|
||||
|
||||
class UTF16Encoder
|
||||
{
|
||||
public:
|
||||
static constexpr u32 GetMaxUnitsPerCodePoint() { return 2; }
|
||||
|
||||
// `ptr` should point to at least 2 code units.
|
||||
// Returns the number of written code units.
|
||||
template <std::integral CharType>
|
||||
requires(sizeof(CharType) == 2)
|
||||
constexpr u32 operator()(char32_t code_point, CharType* ptr)
|
||||
{
|
||||
if (code_point < 0x10000u)
|
||||
{
|
||||
*ptr = u16(code_point);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (code_point > UNICODE_LAST_CODE_POINT)
|
||||
return (*this)(UNICODE_REPLACEMENT_CHARACTER, ptr);
|
||||
|
||||
// Create surrogate pair.
|
||||
const u32 value = code_point - 0x10000;
|
||||
ptr[0] = u16(((value >> 10u) & SURROGATE_VALUE_MASK) | UNICODE_HIGH_SURROGATE);
|
||||
ptr[1] = u16((value & SURROGATE_VALUE_MASK) | UNICODE_LOW_SURROGATE);
|
||||
return 2;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Decoder, typename Encoder, typename ResultCharType, typename InputCharType>
|
||||
static constexpr std::basic_string<ResultCharType>
|
||||
ReEncodeString(std::basic_string_view<InputCharType> input)
|
||||
{
|
||||
Decoder decoder{input};
|
||||
Encoder encoder;
|
||||
|
||||
const auto max_code_units = input.size() * encoder.GetMaxUnitsPerCodePoint();
|
||||
|
||||
std::basic_string<ResultCharType> result;
|
||||
result.resize_and_overwrite(max_code_units, [&](ResultCharType* buf, std::size_t) {
|
||||
auto* position = buf;
|
||||
|
||||
while (decoder.RemainingCodeUnits() != 0)
|
||||
position += encoder(decoder(), position);
|
||||
|
||||
return position - buf;
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
static std::wstring CPToUTF16(u32 code_page, std::string_view input)
|
||||
|
|
@ -567,14 +805,12 @@ std::string UTF16BEToUTF8(const char16_t* str, size_t max_size)
|
|||
|
||||
std::string UTF16ToUTF8(std::u16string_view input)
|
||||
{
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
|
||||
return converter.to_bytes(input.data(), input.data() + input.size());
|
||||
return ReEncodeString<UTF16Decoder<char16_t>, UTF8Encoder, char>(input);
|
||||
}
|
||||
|
||||
std::u16string UTF8ToUTF16(std::string_view input)
|
||||
{
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
|
||||
return converter.from_bytes(input.data(), input.data() + input.size());
|
||||
return ReEncodeString<UTF8Decoder<char>, UTF16Encoder, char16_t>(input);
|
||||
}
|
||||
|
||||
// This is a replacement for path::u8path, which is deprecated starting with C++20.
|
||||
|
|
|
|||
|
|
@ -261,18 +261,52 @@ TEST(StringUtil, CaseInsensitiveContains_OverlappingMatches)
|
|||
|
||||
TEST(StringUtil, CharacterEncodingConversion)
|
||||
{
|
||||
const std::string utf8_variety = "🎮 hello ¥ᚼᛒ﹏🐬";
|
||||
const std::u16string utf16_variety = u"🎮 hello ¥ᚼᛒ﹏🐬";
|
||||
|
||||
// UTF-16 -> UTF-8
|
||||
const std::string utf8_replacement_char = "\xef\xbf\xbd";
|
||||
|
||||
// Unmatched high surrogate.
|
||||
EXPECT_EQ(UTF16ToUTF8(u"\xd800" + utf16_variety), utf8_replacement_char + utf8_variety);
|
||||
EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdbff"), utf8_variety + utf8_replacement_char);
|
||||
|
||||
// Unmatched low surrogate.
|
||||
EXPECT_EQ(UTF16ToUTF8(u"\xdc00" + utf16_variety), utf8_replacement_char + utf8_variety);
|
||||
EXPECT_EQ(UTF16ToUTF8(utf16_variety + u"\xdfff"), utf8_variety + utf8_replacement_char);
|
||||
|
||||
// UTF-8 -> UTF-16
|
||||
const std::u16string utf16_replacement_char = u"\xfffd";
|
||||
|
||||
// Unexpected bytes.
|
||||
EXPECT_EQ(UTF8ToUTF16("\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
EXPECT_EQ(UTF8ToUTF16("\xf8" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
|
||||
// Overlong encodings.
|
||||
EXPECT_EQ(UTF8ToUTF16("\xc0\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
EXPECT_EQ(UTF8ToUTF16("\xe0\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
EXPECT_EQ(UTF8ToUTF16("\xf0\x81\x81\x8a" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
|
||||
// Non-terminated character sequences.
|
||||
EXPECT_EQ(UTF8ToUTF16("\xa0" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
EXPECT_EQ(UTF8ToUTF16("\xc0\xf0"), utf16_replacement_char + utf16_replacement_char);
|
||||
EXPECT_EQ(UTF8ToUTF16(utf8_variety + "\xf0\x9f"), utf16_variety + utf16_replacement_char);
|
||||
EXPECT_EQ(UTF8ToUTF16("\xf0\x9fZ"), utf16_replacement_char + u"Z");
|
||||
|
||||
// Code point greater than U+10FFFF.
|
||||
EXPECT_EQ(UTF8ToUTF16("\xf7\x80\x80\x80" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
|
||||
// Decoded surrogate code points are rejected.
|
||||
EXPECT_EQ(UTF8ToUTF16("\xed\xb6\x81" + utf8_variety), utf16_replacement_char + utf16_variety);
|
||||
|
||||
// wstring
|
||||
EXPECT_EQ(WStringToUTF8(L"hello 🐬"), "hello 🐬");
|
||||
|
||||
// UTF-16
|
||||
EXPECT_EQ(UTF16ToUTF8(u"hello 🐬"), "hello 🐬");
|
||||
EXPECT_EQ(UTF8ToUTF16("hello 🐬"), u"hello 🐬");
|
||||
|
||||
// UTF-16BE
|
||||
char16_t utf16be_str[] = u"hello 🐬";
|
||||
auto utf16be_str = utf16_variety;
|
||||
for (auto& c : utf16be_str)
|
||||
c = Common::swap16(c);
|
||||
EXPECT_EQ(UTF16BEToUTF8(utf16be_str, 99), "hello 🐬");
|
||||
EXPECT_EQ(UTF16BEToUTF8(utf16be_str.c_str(), 99), utf8_variety);
|
||||
|
||||
// Shift JIS
|
||||
EXPECT_EQ(SHIFTJISToUTF8("\x83\x43\x83\x8b\x83\x4a"), "イルカ");
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user