From 9269a55e8bfbcf62db15685501d103c229053f4a Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Tue, 14 Dec 2021 21:52:55 +0700 Subject: [PATCH 1/8] Fixed input of long UTF-8 symbols in editor (#1423) --- src/ui/controls/edit.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp index 570bd518..75bb8cbf 100644 --- a/src/ui/controls/edit.cpp +++ b/src/ui/controls/edit.cpp @@ -492,7 +492,10 @@ bool CEdit::EventProcess(const Event &event) if ( event.type == EVENT_TEXT_INPUT && !bControl && m_bFocus ) { auto data = event.GetData(); - Insert(data->text[0]); // TODO: insert utf-8 char + for ( char c : data->text ) + { + Insert(c); + } SendModifEvent(); return true; } From cd059bd51177d146ad44bbc404254d9ed4f43d6c Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Thu, 16 Dec 2021 21:36:37 +0700 Subject: [PATCH 2/8] Show questions instead of crashing in case of UTF-8 errors --- src/common/stringutils.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp index 1aa97920..3e99a7b7 100644 --- a/src/common/stringutils.cpp +++ b/src/common/stringutils.cpp @@ -182,10 +182,6 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos) if((c & 0xE0) == 0xC0) return 2; - // Invalid char - unexpected continuation byte - if((c & 0xC0) == 0x80) - throw std::invalid_argument("Unexpected UTF-8 continuation byte"); - return 1; } From 65da4c42c4fa01162ddc3cc4a76ec51739e3177c Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Fri, 17 Dec 2021 00:18:08 +0700 Subject: [PATCH 3/8] CEdit::DeleteOne expands selection to delete integer number of UTF-8 symbols --- src/ui/controls/edit.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp index 75bb8cbf..d9d31605 100644 --- a/src/ui/controls/edit.cpp +++ b/src/ui/controls/edit.cpp @@ -2787,6 +2787,11 @@ void CEdit::DeleteOne(int dir) } if ( m_cursor1 
> m_cursor2 ) Math::Swap(m_cursor1, m_cursor2); + + // Expands selection to delete integer number of UTF-8 symbols + while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 --; + while ( m_cursor2 < m_len && (m_text[m_cursor2] & 0xC0) == 0x80 ) m_cursor2 ++; + hole = m_cursor2-m_cursor1; end = m_len-hole; for ( i=m_cursor1 ; i Date: Fri, 17 Dec 2021 00:49:16 +0700 Subject: [PATCH 4/8] CEdit::MoveChar never moves cursor between bytes of one UTF-8 symbol --- src/ui/controls/edit.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp index d9d31605..037a8d06 100644 --- a/src/ui/controls/edit.cpp +++ b/src/ui/controls/edit.cpp @@ -2261,7 +2261,7 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect) { int character; - if ( move == -1 ) // back? + if ( move == -1 ) // back { if ( bWord ) { @@ -2306,12 +2306,15 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect) } else { - m_cursor1 --; - if ( m_cursor1 < 0 ) m_cursor1 = 0; + if ( m_cursor1 > 0 ) + { + m_cursor1 --; + while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 --; + } } } - if ( move == 1 ) // advance? 
+ if ( move == 1 ) // advance { if ( bWord ) { @@ -2356,8 +2359,11 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect) } else { - m_cursor1 ++; - if ( m_cursor1 > m_len ) m_cursor1 = m_len; + if ( m_cursor1 < m_len ) + { + m_cursor1 ++; + while ( m_cursor1 < m_len && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 ++; + } } } From 05b68a4b80c4847498e89627fc47dab00f124eca Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Wed, 22 Dec 2021 21:52:00 +0700 Subject: [PATCH 5/8] Refactor: Create function isUtf8ContinuationByte --- src/common/stringutils.cpp | 4 ++++ src/common/stringutils.h | 3 +++ src/ui/controls/edit.cpp | 15 +++++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp index 3e99a7b7..1e854c31 100644 --- a/src/common/stringutils.cpp +++ b/src/common/stringutils.cpp @@ -197,3 +197,7 @@ std::size_t StrUtils::Utf8StringLength(const std::string &str) return result; } +bool StrUtils::isUtf8ContinuationByte(char c) +{ + return (c & 0b11'000000) == 0b10'000000; +} diff --git a/src/common/stringutils.h b/src/common/stringutils.h index bdb24049..b6eca93c 100644 --- a/src/common/stringutils.h +++ b/src/common/stringutils.h @@ -87,5 +87,8 @@ int Utf8CharSizeAt(const std::string &str, unsigned int pos); //! Returns the length in characters of UTF-8 string \a str std::size_t Utf8StringLength(const std::string &str); +//! 
Returns true if char is continuation UTF-8 byte +bool isUtf8ContinuationByte(char c); + } // namespace StrUtil diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp index 037a8d06..168a97b8 100644 --- a/src/ui/controls/edit.cpp +++ b/src/ui/controls/edit.cpp @@ -27,6 +27,7 @@ #include "common/logger.h" #include "common/make_unique.h" +#include "common/stringutils.h" #include "common/resources/inputstream.h" #include "common/resources/outputstream.h" @@ -2309,7 +2310,10 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect) if ( m_cursor1 > 0 ) { m_cursor1 --; - while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 --; + while ( m_cursor1 > 0 && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) ) + { + m_cursor1 --; + } } } } @@ -2362,7 +2366,10 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect) if ( m_cursor1 < m_len ) { m_cursor1 ++; - while ( m_cursor1 < m_len && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 ++; + while ( m_cursor1 < m_len && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) ) + { + m_cursor1 ++; + } } } } @@ -2795,8 +2802,8 @@ void CEdit::DeleteOne(int dir) if ( m_cursor1 > m_cursor2 ) Math::Swap(m_cursor1, m_cursor2); // Expands selection to delete integer number of UTF-8 symbols - while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 ) m_cursor1 --; - while ( m_cursor2 < m_len && (m_text[m_cursor2] & 0xC0) == 0x80 ) m_cursor2 ++; + while ( m_cursor1 > 0 && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) ) m_cursor1 --; + while ( m_cursor2 < m_len && StrUtils::isUtf8ContinuationByte(m_text[m_cursor2]) ) m_cursor2 ++; hole = m_cursor2-m_cursor1; end = m_len-hole; From 4bce63e38d449085f06ca960fe38eceec649d84a Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Mon, 14 Feb 2022 18:09:08 +0700 Subject: [PATCH 6/8] Use 0b instead of 0x and check 1-byte prefix first in Utf8CharSizeAt 1-byte symbols are more common than 4-byte symbols. So checking the 1-byte prefix first is more efficient. 
--- src/common/stringutils.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp index 5c5e207a..bc11fee4 100644 --- a/src/common/stringutils.cpp +++ b/src/common/stringutils.cpp @@ -175,12 +175,14 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos) return 0; const char c = str[pos]; - if((c & 0xF8) == 0xF0) - return 4; - if((c & 0xF0) == 0xE0) - return 3; - if((c & 0xE0) == 0xC0) + if((c & 0b1000'0000) == 0b0000'0000) + return 1; + if((c & 0b1110'0000) == 0b1100'0000) return 2; + if((c & 0b1111'0000) == 0b1110'0000) + return 3; + if((c & 0b1111'1000) == 0b1111'0000) + return 4; return 1; } @@ -199,5 +201,5 @@ std::size_t StrUtils::Utf8StringLength(const std::string &str) bool StrUtils::isUtf8ContinuationByte(char c) { - return (c & 0b11'000000) == 0b10'000000; + return (c & 0b1100'0000) == 0b1000'0000; } From d9e26c25160746a57c5a91dcb842389a8f71f761 Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Mon, 14 Feb 2022 17:17:13 +0700 Subject: [PATCH 7/8] Use std::invalid_argument in Utf8CharSizeAt --- src/common/stringutils.cpp | 7 ++++++- src/graphics/engine/text.cpp | 9 ++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp index bc11fee4..a7d30f6a 100644 --- a/src/common/stringutils.cpp +++ b/src/common/stringutils.cpp @@ -184,7 +184,12 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos) if((c & 0b1111'1000) == 0b1111'0000) return 4; - return 1; + // Invalid char - unexpected continuation byte + if (isUtf8ContinuationByte(c)) + throw std::invalid_argument("Unexpected UTF-8 continuation byte"); + + // (c & 0b1111'1000) == 0b1111'1000 is true here + throw std::invalid_argument("Byte value has no sense in UTF-8"); } std::size_t StrUtils::Utf8StringLength(const std::string &str) diff --git a/src/graphics/engine/text.cpp b/src/graphics/engine/text.cpp index 
315a3fbb..7b6cb8a3 100644 --- a/src/graphics/engine/text.cpp +++ b/src/graphics/engine/text.cpp @@ -937,7 +937,14 @@ int CText::GetCharSizeAt(Gfx::FontType font, const std::string& text, unsigned i } else { - len = StrUtils::Utf8CharSizeAt(text, index); + try + { + len = StrUtils::Utf8CharSizeAt(text, index); + } + catch (std::invalid_argument &e) + { + len = 1; + } } return len; } From 550d0f915bb8a489d86501eae48d8b47b44050d1 Mon Sep 17 00:00:00 2001 From: Evgeny Pestov Date: Mon, 14 Feb 2022 18:33:41 +0700 Subject: [PATCH 8/8] Use std::out_of_range in Utf8CharSizeAt --- src/common/stringutils.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp index a7d30f6a..626c5c39 100644 --- a/src/common/stringutils.cpp +++ b/src/common/stringutils.cpp @@ -156,11 +156,17 @@ std::wstring StrUtils::Utf8StringToUnicode(const std::string &str) { std::wstring result; unsigned int pos = 0; + int len; while (pos < str.size()) { - int len = StrUtils::Utf8CharSizeAt(str, pos); - if (len == 0) + try + { + len = StrUtils::Utf8CharSizeAt(str, pos); + } + catch (std::out_of_range &e) + { break; + } std::string ch = str.substr(pos, len); result += static_cast(StrUtils::Utf8CharToUnicode(ch)); @@ -172,7 +178,7 @@ std::wstring StrUtils::Utf8StringToUnicode(const std::string &str) int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos) { if (pos >= str.size()) - return 0; + throw std::out_of_range("Index is greater than size"); const char c = str[pos]; if((c & 0b1000'0000) == 0b0000'0000)