From 9269a55e8bfbcf62db15685501d103c229053f4a Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Tue, 14 Dec 2021 21:52:55 +0700
Subject: [PATCH 1/8] Fixed input of long UTF-8 symbols in editor (#1423)

---
 src/ui/controls/edit.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp
index 570bd518..75bb8cbf 100644
--- a/src/ui/controls/edit.cpp
+++ b/src/ui/controls/edit.cpp
@@ -492,7 +492,10 @@ bool CEdit::EventProcess(const Event &event)
     if ( event.type == EVENT_TEXT_INPUT && !bControl && m_bFocus )
     {
         auto data = event.GetData<TextInputData>();
-        Insert(data->text[0]); // TODO: insert utf-8 char
+        for ( char c : data->text )
+        {
+            Insert(c);
+        }
         SendModifEvent();
         return true;
     }

From cd059bd51177d146ad44bbc404254d9ed4f43d6c Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Thu, 16 Dec 2021 21:36:37 +0700
Subject: [PATCH 2/8] Show questions instead of crashing in case of UTF-8
 errors

---
 src/common/stringutils.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp
index 1aa97920..3e99a7b7 100644
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@@ -182,10 +182,6 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos)
     if((c & 0xE0) == 0xC0)
         return 2;
 
-    // Invalid char - unexpected continuation byte
-    if((c & 0xC0) == 0x80)
-        throw std::invalid_argument("Unexpected UTF-8 continuation byte");
-
     return 1;
 }
 

From 65da4c42c4fa01162ddc3cc4a76ec51739e3177c Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Fri, 17 Dec 2021 00:18:08 +0700
Subject: [PATCH 3/8] CEdit::DeleteOne expands selection to delete a whole
 number of UTF-8 symbols

---
 src/ui/controls/edit.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp
index 75bb8cbf..d9d31605 100644
--- a/src/ui/controls/edit.cpp
+++ b/src/ui/controls/edit.cpp
@@ -2787,6 +2787,11 @@ void CEdit::DeleteOne(int dir)
     }
 
     if ( m_cursor1 > m_cursor2 )  Math::Swap(m_cursor1, m_cursor2);
+
+    // Expands selection to delete integer number of UTF-8 symbols
+    while ( m_cursor1 > 0     && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 --;
+    while ( m_cursor2 < m_len && (m_text[m_cursor2] & 0xC0) == 0x80 )  m_cursor2 ++;
+
     hole = m_cursor2-m_cursor1;
     end = m_len-hole;
     for ( i=m_cursor1 ; i<end ; i++ )

From 69ea470a26b5b9b710d099a8764e9d7cbf2fe2ab Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Fri, 17 Dec 2021 00:49:16 +0700
Subject: [PATCH 4/8] CEdit::MoveChar never moves cursor between bytes of one
 UTF-8 symbol

---
 src/ui/controls/edit.cpp | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp
index d9d31605..037a8d06 100644
--- a/src/ui/controls/edit.cpp
+++ b/src/ui/controls/edit.cpp
@@ -2261,7 +2261,7 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect)
 {
     int     character;
 
-    if ( move == -1 )  // back?
+    if ( move == -1 )  // back
     {
         if ( bWord )
         {
@@ -2306,12 +2306,15 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect)
         }
         else
         {
-            m_cursor1 --;
-            if ( m_cursor1 < 0 )  m_cursor1 = 0;
+            if ( m_cursor1 > 0 )
+            {
+                m_cursor1 --;
+                while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 --;
+            }
         }
     }
 
-    if ( move == 1 )  // advance?
+    if ( move == 1 )  // advance
     {
         if ( bWord )
         {
@@ -2356,8 +2359,11 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect)
         }
         else
         {
-            m_cursor1 ++;
-            if ( m_cursor1 > m_len )  m_cursor1 = m_len;
+            if ( m_cursor1 < m_len )
+            {
+                m_cursor1 ++;
+                while ( m_cursor1 < m_len && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 ++;
+            }
         }
     }
 

From 05b68a4b80c4847498e89627fc47dab00f124eca Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Wed, 22 Dec 2021 21:52:00 +0700
Subject: [PATCH 5/8] Refactor: Create function isUtf8ContinuationByte

---
 src/common/stringutils.cpp |  4 ++++
 src/common/stringutils.h   |  3 +++
 src/ui/controls/edit.cpp   | 15 +++++++++++----
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp
index 3e99a7b7..1e854c31 100644
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@@ -197,3 +197,7 @@ std::size_t StrUtils::Utf8StringLength(const std::string &str)
     return result;
 }
 
+bool StrUtils::isUtf8ContinuationByte(char c)
+{
+    return (c & 0b11'000000) == 0b10'000000;
+}
diff --git a/src/common/stringutils.h b/src/common/stringutils.h
index bdb24049..b6eca93c 100644
--- a/src/common/stringutils.h
+++ b/src/common/stringutils.h
@@ -87,5 +87,8 @@ int Utf8CharSizeAt(const std::string &str, unsigned int pos);
 //! Returns the length in characters of UTF-8 string \a str
 std::size_t Utf8StringLength(const std::string &str);
 
+//! Returns true if char is continuation UTF-8 byte
+bool isUtf8ContinuationByte(char c);
+
 } // namespace StrUtil
 
diff --git a/src/ui/controls/edit.cpp b/src/ui/controls/edit.cpp
index 037a8d06..168a97b8 100644
--- a/src/ui/controls/edit.cpp
+++ b/src/ui/controls/edit.cpp
@@ -27,6 +27,7 @@
 
 #include "common/logger.h"
 #include "common/make_unique.h"
+#include "common/stringutils.h"
 
 #include "common/resources/inputstream.h"
 #include "common/resources/outputstream.h"
@@ -2309,7 +2310,10 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect)
             if ( m_cursor1 > 0 )
             {
                 m_cursor1 --;
-                while ( m_cursor1 > 0 && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 --;
+                while ( m_cursor1 > 0 && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) )
+                {
+                    m_cursor1 --;
+                }
             }
         }
     }
@@ -2362,7 +2366,10 @@ void CEdit::MoveChar(int move, bool bWord, bool bSelect)
             if ( m_cursor1 < m_len )
             {
                 m_cursor1 ++;
-                while ( m_cursor1 < m_len && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 ++;
+                while ( m_cursor1 < m_len && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) )
+                {
+                    m_cursor1 ++;
+                }
             }
         }
     }
@@ -2795,8 +2802,8 @@ void CEdit::DeleteOne(int dir)
     if ( m_cursor1 > m_cursor2 )  Math::Swap(m_cursor1, m_cursor2);
 
     // Expands selection to delete integer number of UTF-8 symbols
-    while ( m_cursor1 > 0     && (m_text[m_cursor1] & 0xC0) == 0x80 )  m_cursor1 --;
-    while ( m_cursor2 < m_len && (m_text[m_cursor2] & 0xC0) == 0x80 )  m_cursor2 ++;
+    while ( m_cursor1 > 0     && StrUtils::isUtf8ContinuationByte(m_text[m_cursor1]) )  m_cursor1 --;
+    while ( m_cursor2 < m_len && StrUtils::isUtf8ContinuationByte(m_text[m_cursor2]) )  m_cursor2 ++;
 
     hole = m_cursor2-m_cursor1;
     end = m_len-hole;

From 4bce63e38d449085f06ca960fe38eceec649d84a Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Mon, 14 Feb 2022 18:09:08 +0700
Subject: [PATCH 6/8] Use 0b instead of 0x and check 1-byte prefix first in
 Utf8CharSizeAt

1-byte symbols are more common than 4-byte symbols,
so checking the 1-byte prefix first is more efficient.
---
 src/common/stringutils.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp
index 5c5e207a..bc11fee4 100644
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@@ -175,12 +175,14 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos)
         return 0;
 
     const char c = str[pos];
-    if((c & 0xF8) == 0xF0)
-        return 4;
-    if((c & 0xF0) == 0xE0)
-        return 3;
-    if((c & 0xE0) == 0xC0)
+    if((c & 0b1000'0000) == 0b0000'0000)
+        return 1;
+    if((c & 0b1110'0000) == 0b1100'0000)
         return 2;
+    if((c & 0b1111'0000) == 0b1110'0000)
+        return 3;
+    if((c & 0b1111'1000) == 0b1111'0000)
+        return 4;
 
     return 1;
 }
@@ -199,5 +201,5 @@ std::size_t StrUtils::Utf8StringLength(const std::string &str)
 
 bool StrUtils::isUtf8ContinuationByte(char c)
 {
-    return (c & 0b11'000000) == 0b10'000000;
+    return (c & 0b1100'0000) == 0b1000'0000;
 }

From d9e26c25160746a57c5a91dcb842389a8f71f761 Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Mon, 14 Feb 2022 17:17:13 +0700
Subject: [PATCH 7/8] Use std::invalid_argument in Utf8CharSizeAt

---
 src/common/stringutils.cpp   | 7 ++++++-
 src/graphics/engine/text.cpp | 9 ++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp
index bc11fee4..a7d30f6a 100644
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@@ -184,7 +184,12 @@ int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos)
     if((c & 0b1111'1000) == 0b1111'0000)
         return 4;
 
-    return 1;
+    // Invalid char - unexpected continuation byte
+    if (isUtf8ContinuationByte(c))
+        throw std::invalid_argument("Unexpected UTF-8 continuation byte");
+
+    // (c & 0b1111'1000) == 0b1111'1000 is true here
+    throw std::invalid_argument("Byte value has no sense in UTF-8");
 }
 
 std::size_t StrUtils::Utf8StringLength(const std::string &str)
diff --git a/src/graphics/engine/text.cpp b/src/graphics/engine/text.cpp
index 315a3fbb..7b6cb8a3 100644
--- a/src/graphics/engine/text.cpp
+++ b/src/graphics/engine/text.cpp
@@ -937,7 +937,14 @@ int CText::GetCharSizeAt(Gfx::FontType font, const std::string& text, unsigned i
     }
     else
     {
-        len = StrUtils::Utf8CharSizeAt(text, index);
+        try
+        {
+            len = StrUtils::Utf8CharSizeAt(text, index);
+        }
+        catch (std::invalid_argument &e)
+        {
+            len = 1;
+        }
     }
     return len;
 }

From 550d0f915bb8a489d86501eae48d8b47b44050d1 Mon Sep 17 00:00:00 2001
From: Evgeny Pestov <pestoffne@vtomske.ru>
Date: Mon, 14 Feb 2022 18:33:41 +0700
Subject: [PATCH 8/8] Use std::out_of_range in Utf8CharSizeAt

---
 src/common/stringutils.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/common/stringutils.cpp b/src/common/stringutils.cpp
index a7d30f6a..626c5c39 100644
--- a/src/common/stringutils.cpp
+++ b/src/common/stringutils.cpp
@@ -156,11 +156,17 @@ std::wstring StrUtils::Utf8StringToUnicode(const std::string &str)
 {
     std::wstring result;
     unsigned int pos = 0;
+    int len;
     while (pos < str.size())
     {
-        int len = StrUtils::Utf8CharSizeAt(str, pos);
-        if (len == 0)
+        try
+        {
+            len = StrUtils::Utf8CharSizeAt(str, pos);
+        }
+        catch (std::out_of_range &e)
+        {
             break;
+        }
 
         std::string ch = str.substr(pos, len);
         result += static_cast<wchar_t>(StrUtils::Utf8CharToUnicode(ch));
@@ -172,7 +178,7 @@ std::wstring StrUtils::Utf8StringToUnicode(const std::string &str)
 int StrUtils::Utf8CharSizeAt(const std::string &str, unsigned int pos)
 {
     if (pos >= str.size())
-        return 0;
+        throw std::out_of_range("Index is greater than size");
 
     const char c = str[pos];
     if((c & 0b1000'0000) == 0b0000'0000)