Rewritten UTF-8 implementation using standard library

2023-08-14 21:46:05 +02:00 · 2023-08-14 21:46:05 +02:00 · b3c6e667f5
parent a48f13fb35
commit b3c6e667f5
5 changed files with 111 additions and 113 deletions
--- a/colobot-app/CMakeLists.txt
+++ b/colobot-app/CMakeLists.txt
@ -7,6 +7,10 @@ target_sources(colobot PRIVATE
    src/main.cpp
 )

+if(MSVC)
+    target_compile_options(colobot PRIVATE /utf-8) # applies to this file's target (src/main.cpp); colobotbase sets /utf-8 in src/CMakeLists.txt
+endif()
+
 if(PLATFORM_WINDOWS)
    target_sources(colobot PRIVATE ${PROJECT_BINARY_DIR}/desktop/colobot.rc)
 endif()
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -529,6 +529,10 @@ if(WINGETOPT)
    target_link_libraries(colobotbase PUBLIC wingetopt)
 endif()

+if(MSVC)
+    target_compile_options(colobotbase PRIVATE /utf-8)
+endif()
+
 # Additional libraries per platform
 if(PLATFORM_WINDOWS)
    # because it isn't included in standard linking libraries
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@ -109,28 +109,16 @@ void StrUtils::Trim(std::string& str)

 std::string StrUtils::UnicodeCharToUtf8(unsigned int ch)
 {
-    std::string result;
-    if (ch < 0x0080)
-    {
-        result += static_cast<char>(ch);
-    }
-    else if (ch < 0x0800)
-    {
-        char ch1 = 0xC0 | ((ch & 0x07C0) >> 6);
-        char ch2 = 0x80 | (ch & 0x3F);
-        result += ch1;
-        result += ch2;
-    }
-    else
-    {
-        char ch1 = 0xE0 | ((ch & 0xF000) >> 12);
-        char ch2 = 0x80 | ((ch & 0x07C0) >> 6);
-        char ch3 = 0x80 | (ch & 0x3F);
-        result += ch1;
-        result += ch2;
-        result += ch3;
-    }
-    return result;
+    std::array<char, MB_LEN_MAX> buffer; // wcrtomb() may store up to MB_CUR_MAX bytes, which can exceed 4; MB_LEN_MAX comes from <climits>
+
+    std::mbstate_t state = {};
+
+    std::size_t count = std::wcrtomb(buffer.data(), static_cast<wchar_t>(ch), &state); // encoding follows the current C locale
+
+    if (count == 0) count = 1;
+    else if (count == static_cast<std::size_t>(-1)) throw std::invalid_argument("Invalid character"); // (size_t)-1 signals an unencodable character
+
+    return std::string(buffer.data(), count);
 }

 std::string StrUtils::UnicodeStringToUtf8(const std::wstring &str)
@ -147,55 +135,40 @@ unsigned int StrUtils::Utf8CharToUnicode(const std::string &ch)
    if (ch.empty())
        return 0;

-    unsigned int result = 0;
-    if ((ch[0] & 0x80) == 0)
-    {
-        if (ch.size() == 1)
-        result = static_cast<unsigned int>(ch[0]);
-    }
-    else if ((ch[0] & 0xC0) == 0xC0)
-    {
-        if (ch.size() == 2)
-        {
-            unsigned int ch1 = (ch[0] & 0x1F) << 6;
-            unsigned int ch2 = (ch[1] & 0x3F);
-            result = ch1 | ch2;
-        }
-    }
-    else
-    {
-        if (ch.size() == 3)
-        {
-            unsigned int ch1 = (ch[0] & 0xF0) << 12;
-            unsigned int ch2 = (ch[1] & 0xC0) << 6;
-            unsigned int ch3 = (ch[2] & 0xC0);
-            result = ch1 | ch2 | ch3;
-        }
-    }
+    std::mbstate_t state = {};

-    return result;
+    wchar_t c = 0;
+
+    int len = mbrtowc(&c, ch.data(), ch.size(), &state);
+
+    if (len == 0) return L'\0';
+    else if (len == -1) throw std::invalid_argument("Invalid character");
+    else if (len == -2) throw std::invalid_argument("Invalid character");
+
+    return c;
 }

 std::wstring StrUtils::Utf8StringToUnicode(const std::string &str)
 {
    std::wstring result;
-    unsigned int pos = 0;
-    int len;
-    while (pos < str.size())
-    {
-        try
-        {
-            len = StrUtils::Utf8CharSizeAt(str, pos);
-        }
-        catch (std::out_of_range &e)
-        {
-            break;
-        }
+    result.reserve(str.size());

-        std::string ch = str.substr(pos, len);
-        result += static_cast<wchar_t>(StrUtils::Utf8CharToUnicode(ch));
-        pos += len;
+    for (size_t i = 0; i < str.size();)
+    {
+        std::mbstate_t state = {};
+        wchar_t ch;
+
+        int len = std::mbrtowc(&ch, str.data() + i, str.size() - i, &state);
+
+        if (len == 0) len = 1;
+        else if (len == -1) throw std::invalid_argument("Invalid character");
+        else if (len == -2) throw std::invalid_argument("Invalid character");
+
+        i += len;
+
+        result += ch;
    }
+
    return result;
 }

@ -204,33 +177,36 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos)
    if (pos >= str.size())
        throw std::out_of_range("Index is greater than size");

-    const char c = str[pos];
-    if((c & 0b1000'0000) == 0b0000'0000)
-        return 1;
-    if((c & 0b1110'0000) == 0b1100'0000)
-        return 2;
-    if((c & 0b1111'0000) == 0b1110'0000)
-        return 3;
-    if((c & 0b1111'1000) == 0b1111'0000)
-        return 4;
+    std::mbstate_t state = {};

-    // Invalid char - unexpected continuation byte
-    if (isUtf8ContinuationByte(c))
-        throw std::invalid_argument("Unexpected UTF-8 continuation byte");
+    int len = std::mbrlen(str.data() + pos, str.size() - pos, &state);

-    // (c & 0b1111'1000) == 0b1111'1000 is true here
-    throw std::invalid_argument("Byte value has no sense in UTF-8");
+    if (len == 0) len = 1;
+    else if (len == -1) throw std::invalid_argument("Invalid character");
+    else if (len == -2) throw std::invalid_argument("Invalid character");
+
+    return len;
 }

 std::size_t StrUtils::Utf8StringLength(const std::string &str)
 {
    std::size_t result = 0;
-    unsigned int i = 0;
-    while (i < str.size())
+
+    for (size_t i = 0; i < str.size();)
    {
-        i += Utf8CharSizeAt(str, i);
+        std::mbstate_t state = {};
+
+        size_t count = std::mbrlen(str.data() + i, str.size() - i, &state);
+
+        if (count == 0) count = 1;
+        else if (count == -1) throw std::invalid_argument("Invalid character");
+        else if (count == -2) throw std::invalid_argument("Invalid character");
+
+        i += count;
+
        ++result;
    }
+
    return result;
 }

@ -242,6 +218,9 @@ bool StrUtils::isUtf8ContinuationByte(char c)
 std::string StrUtils::ToLower(const std::string& text)
 {
    std::string result;
+    result.reserve(text.size());
+
+    std::array<char, MB_LEN_MAX> buffer; // wcrtomb() may store up to MB_CUR_MAX bytes, which can exceed 4; MB_LEN_MAX comes from <climits>

    for (size_t i = 0; i < text.size();)
    {
@ -250,24 +229,21 @@ std::string StrUtils::ToLower(const std::string& text)

        int len = std::mbrtowc(&ch, text.data() + i, text.size() - i, &state);

-        if (len == -1) throw std::invalid_argument("Invalid character");
-
        if (len == 0) len = 1;
+        else if (len == -1) throw std::invalid_argument("Invalid character");
+        else if (len == -2) throw std::invalid_argument("Invalid character");
+
+        i += len;

        ch = std::towlower(ch);

-        char buffer[8];
-
        state = {};
-        size_t count = std::wcrtomb(buffer, ch, &state);
-
-        if (count == -1) throw std::invalid_argument("Invalid character");
+        size_t count = std::wcrtomb(buffer.data(), ch, &state);

        if (count == 0) count = 1;
+        else if (count == -1) throw std::invalid_argument("Invalid character");

-        result.append(buffer, count);
-
-        i += len;
+        result.append(buffer.data(), count);
    }

    return result;
@ -276,6 +252,9 @@ std::string StrUtils::ToLower(const std::string& text)
 std::string StrUtils::ToUpper(const std::string& text)
 {
    std::string result;
+    result.reserve(text.size());
+
+    std::array<char, MB_LEN_MAX> buffer; // wcrtomb() may store up to MB_CUR_MAX bytes, which can exceed 4; MB_LEN_MAX comes from <climits>

    for (size_t i = 0; i < text.size();)
    {
@ -284,24 +263,21 @@ std::string StrUtils::ToUpper(const std::string& text)

        size_t len = std::mbrtowc(&ch, text.data() + i, text.size() - i, &state);

-        if (len == -1) throw std::invalid_argument("Invalid character");
-
        if (len == 0) len = 1;
+        else if (len == -1) throw std::invalid_argument("Invalid character");
+        else if (len == -2) throw std::invalid_argument("Invalid character");
+
+        i += len;

        ch = std::towupper(ch);

-        char buffer[8];
-
        state = {};
-        size_t count = std::wcrtomb(buffer, ch, &state);
+        size_t count = std::wcrtomb(buffer.data(), ch, &state);

-        if (count == -1) throw std::invalid_argument("Invalid character");
-        
        if (count == 0) count = 1;
+        else if (count == -1) throw std::invalid_argument("Invalid character");

-        result.append(buffer, count);
-
-        i += len;
+        result.append(buffer.data(), count);
    }

    return result;
--- a/src/level/parser/parserparam.cpp
+++ b/src/level/parser/parserparam.cpp
@ -212,7 +212,7 @@ bool CLevelParserParam::AsBool()
    if (m_empty)
        throw CLevelParserExceptionMissingParam(this);
    std::string value = m_value;
-    boost::to_lower(value);
+    value = StrUtils::ToLower(value);
    if (value == "true") return true;
    if (value == "false") return false;
    return Cast<bool>("bool");
--- a/test/unit/common/stringutils_test.cpp
+++ b/test/unit/common/stringutils_test.cpp
@ -27,55 +27,69 @@ namespace StringUtilsTesta
 TEST(StringUtilTests, ReplaceShortToLong)
 {
    std::string text = "Test {123}, {123}, {123}{123} Test";
+    std::string expected = "Test [0987654], [0987654], [0987654][0987654] Test";

    auto result = StrUtils::Replace(text, "{123}", "[0987654]");

-    std::string expected = "Test [0987654], [0987654], [0987654][0987654] Test";
-
    EXPECT_EQ(result, expected);
 }

 TEST(StringUtilTests, ReplaceLongToShort)
 {
    std::string text = "Test {1234567}, {1234567}, {1234567}{1234567} Test";
+    std::string expected = "Test [64], [64], [64][64] Test";

    auto result = StrUtils::Replace(text, "{1234567}", "[64]");

-    std::string expected = "Test [64], [64], [64][64] Test";
-
    EXPECT_EQ(result, expected);
 }

 TEST(StringUtilTests, ReplaceSameLength)
 {
    std::string text = "Test {123}, {123}, {123}{123} Test";
+    std::string expected = "Test [432], [432], [432][432] Test";

    auto result = StrUtils::Replace(text, "{123}", "[432]");

-    std::string expected = "Test [432], [432], [432][432] Test";
-
    EXPECT_EQ(result, expected);
 }

+TEST(StringUtilTests, StringCodePointCounts)
+{
+    EXPECT_EQ(StrUtils::Utf8CharSizeAt("a", 0), 1);
+    EXPECT_EQ(StrUtils::Utf8CharSizeAt(u8"ą", 0), 2); // u8 pins the literal to UTF-8 regardless of the execution charset
+    EXPECT_EQ(StrUtils::Utf8CharSizeAt(u8"中", 0), 3);
+}
+
+TEST(StringUtilTests, StringConversion)
+{
+    std::string text = u8",./;AaZzĄąĘę中";
+    std::wstring expected = L",./;AaZzĄąĘę中";
+
+    std::wstring unicode = StrUtils::Utf8StringToUnicode(text);
+    std::string result = StrUtils::UnicodeStringToUtf8(unicode);
+
+    EXPECT_EQ(result, text);
+    EXPECT_EQ(unicode, expected);
+}
+
 TEST(StringUtilTests, ToLowerTest)
 {
-    std::string text = u8",./;AaBbĄąĘę";
+    std::string text = u8",./;AaBbĄąĘę中";
+    std::string expected = u8",./;aabbąąęę中";

    auto result = StrUtils::ToLower(text);

-    std::string expected = u8",./;aabbąąęę";
-
    EXPECT_EQ(result, expected);
 }

 TEST(StringUtilTests, ToUpperTest)
 {
-    std::string text = u8",./;AaBbĄąĘę";
+    std::string text = u8",./;AaBbĄąĘę中";
+    std::string expected = u8",./;AABBĄĄĘĘ中";

    auto result = StrUtils::ToUpper(text);

-    std::string expected = u8",./;AABBĄĄĘĘ";
-
    EXPECT_EQ(result, expected);
 }