fix(multibyte): handle backspace of wide clusters in replace mode

Make utf_head_off more robust against invalid sequences and embedded NUL chars
author: bfredl <bjorn.linse@gmail.com> 2024-09-04 12:09:42 +0200
committer: bfredl <bjorn.linse@gmail.com> 2024-09-06 10:22:29 +0200
commit: fa99afe35eb5d8cf01d875e12b53165bf1104a60 (patch)
tree: 2a304fcd0262e71ef64d2bbb91fcbdc5eb89507d
parent: 7b7c95dac97d6ea4f10855cc198dce650a796c20 (diff)
download: rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.tar.gz
rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.tar.bz2
rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.zip
9 files changed, 195 insertions, 150 deletions
diff --git a/src/nvim/api/vim.c b/src/nvim/api/vim.c
index 7a4aefc620..630c534a7f 100644
--- a/src/nvim/api/vim.c
+++ b/src/nvim/api/vim.c
@@ -28,6 +28,7 @@
 #include "nvim/cursor.h"
 #include "nvim/decoration.h"
 #include "nvim/drawscreen.h"
+#include "nvim/edit.h"
 #include "nvim/errors.h"
 #include "nvim/eval.h"
 #include "nvim/eval/typval.h"
diff --git a/src/nvim/change.c b/src/nvim/change.c
index 47a9f0ce92..51a13b80e7 100644
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@@ -756,10 +756,8 @@ void ins_char_bytes(char *buf, size_t charlen)
     // put back when BS is used.  The bytes of a multi-byte character are
     // done the other way around, so that the first byte is popped off
     // first (it tells the byte length of the character).
-    replace_push(NUL);
-    for (size_t i = 0; i < oldlen; i++) {
-      i += (size_t)replace_push_mb(oldp + col + i) - 1;
-    }
+    replace_push_nul();
+    replace_push(oldp + col, oldlen);
   }
 
   char *newp = xmalloc(linelen + newlen - oldlen);
@@ -1137,12 +1135,10 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
     // on the line onto the replace stack.  We'll push any other characters
     // that might be replaced at the start of the next line (due to
     // autoindent etc) a bit later.
-    replace_push(NUL);      // Call twice because BS over NL expects it
-    replace_push(NUL);
+    replace_push_nul();      // Call twice because BS over NL expects it
+    replace_push_nul();
     p = saved_line + curwin->w_cursor.col;
-    while (*p != NUL) {
-      p += replace_push_mb(p);
-    }
+    replace_push(p, strlen(p));
     saved_line[curwin->w_cursor.col] = NUL;
   }
 
@@ -1691,13 +1687,13 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
     // stack, preceded by a NUL, so they can be put back when a BS is
     // entered.
     if (REPLACE_NORMAL(State)) {
-      replace_push(NUL);            // end of extra blanks
+      replace_push_nul();            // end of extra blanks
     }
     if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
       while ((*p_extra == ' ' || *p_extra == '\t')
              && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
         if (REPLACE_NORMAL(State)) {
-          replace_push(*p_extra);
+          replace_push(p_extra, 1);  // always ascii, len = 1
         }
         p_extra++;
         less_cols_off++;
@@ -1794,7 +1790,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
     // must be a NUL on the replace stack, for when it is deleted with BS
     if (REPLACE_NORMAL(State)) {
       for (colnr_T n = 0; n < curwin->w_cursor.col; n++) {
-        replace_push(NUL);
+        replace_push_nul();
       }
     }
     newcol += curwin->w_cursor.col;
@@ -1808,7 +1804,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
   // must be a NUL on the replace stack, for when it is deleted with BS.
   if (REPLACE_NORMAL(State)) {
     while (lead_len-- > 0) {
-      replace_push(NUL);
+      replace_push_nul();
     }
   }
 
diff --git a/src/nvim/edit.c b/src/nvim/edit.c
index 64c54b0f37..13623eaa91 100644
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@@ -136,6 +136,8 @@ static TriState dont_sync_undo = kFalse;  // CTRL-G U prevents syncing undo
 
 static linenr_T o_lnum = 0;
 
+static kvec_t(char) replace_stack = KV_INITIAL_VALUE;
+
 static void insert_enter(InsertState *s)
 {
   s->did_backspace = true;
@@ -1618,9 +1620,8 @@ void undisplay_dollar(void)
 /// type == INDENT_SET   set indent to "amount"
 ///
 /// @param round               if true, round the indent to 'shiftwidth' (only with _INC and _Dec).
-/// @param replaced            replaced character, put on replace stack
 /// @param call_changed_bytes  call changed_bytes()
-void change_indent(int type, int amount, int round, int replaced, bool call_changed_bytes)
+void change_indent(int type, int amount, int round, bool call_changed_bytes)
 {
   int insstart_less;                    // reduction for Insstart.col
   colnr_T orig_col = 0;                 // init for GCC
@@ -1767,12 +1768,8 @@ void change_indent(int type, int amount, int round, int replaced, bool call_chan
       replace_join(0);              // remove a NUL from the replace stack
       start_col--;
     }
-    while (start_col < (int)curwin->w_cursor.col || replaced) {
-      replace_push(NUL);
-      if (replaced) {
-        replace_push(replaced);
-        replaced = NUL;
-      }
+    while (start_col < (int)curwin->w_cursor.col) {
+      replace_push_nul();
       start_col++;
     }
   }
@@ -2325,7 +2322,7 @@ int stop_arrow(void)
 static void stop_insert(pos_T *end_insert_pos, int esc, int nomove)
 {
   stop_redo_ins();
-  replace_flush();              // abandon replace stack
+  kv_destroy(replace_stack);  // abandon replace stack (reinitializes)
 
   // Save the inserted text for later redo with ^@ and CTRL-A.
   // Don't do it when "restart_edit" was set and nothing was inserted,
@@ -2802,57 +2799,51 @@ static bool echeck_abbr(int c)
 // that the NL replaced.  The extra one stores the characters after the cursor
 // that were deleted (always white space).
 
-static uint8_t *replace_stack = NULL;
-static ssize_t replace_stack_nr = 0;           // next entry in replace stack
-static ssize_t replace_stack_len = 0;          // max. number of entries
-
 /// Push character that is replaced onto the replace stack.
 ///
 /// replace_offset is normally 0, in which case replace_push will add a new
 /// character at the end of the stack.  If replace_offset is not 0, that many
 /// characters will be left on the stack above the newly inserted character.
 ///
-/// @param c character that is replaced (NUL is none)
-void replace_push(int c)
+/// @param str character that is replaced (NUL is none)
+/// @param len length of character in bytes
+void replace_push(char *str, size_t len)
 {
-  if (replace_stack_nr < replace_offset) {  // nothing to do
+  // TODO(bfredl): replace_offset looks suspicious; if we don't need it, this
+  // function is just kv_concat().
+  if (kv_size(replace_stack) < (size_t)replace_offset) {  // nothing to do
     return;
   }
 
-  if (replace_stack_len <= replace_stack_nr) {
-    replace_stack_len += 50;
-    replace_stack = xrealloc(replace_stack, (size_t)replace_stack_len);
-  }
-  uint8_t *p = replace_stack + replace_stack_nr - replace_offset;
+  kv_ensure_space(replace_stack, len);
+
+  char *p = replace_stack.items + kv_size(replace_stack) - replace_offset;
   if (replace_offset) {
-    memmove(p + 1, p, (size_t)replace_offset);
+    memmove(p + len, p, (size_t)replace_offset);
   }
-  *p = (uint8_t)c;
-  replace_stack_nr++;
+  memcpy(p, str, len);
+  kv_size(replace_stack) += len;
 }
 
-/// Push a character onto the replace stack.  Handles a multi-byte character in
-/// reverse byte order, so that the first byte is popped off first.
-///
-/// @return  the number of bytes done (includes composing characters).
-int replace_push_mb(char *p)
+/// push NUL as separator between entries in the stack
+void replace_push_nul(void)
 {
-  int l = utfc_ptr2len(p);
-
-  // TODO(bfredl): stop doing this insantity and instead use utf_head_off() when popping.
-  // or just keep a secondary array with char byte lenghts
-  for (int j = l - 1; j >= 0; j--) {
-    replace_push(p[j]);
-  }
-  return l;
+  replace_push("", 1);
 }
 
-/// Pop one item from the replace stack.
+/// Check top of replace stack, pop it if it was NUL
 ///
-/// @return -1 if stack is empty, replaced character or NUL otherwise
-static int replace_pop(void)
+/// when a non-NUL byte is found, use mb_replace_pop_ins() to
+/// pop one complete multibyte character.
+///
+/// @return -1 if stack is empty, last byte of char or NUL otherwise
+static int replace_pop_if_nul(void)
 {
-  return (replace_stack_nr == 0) ? -1 : (int)replace_stack[--replace_stack_nr];
+  int ch = (kv_size(replace_stack)) ? (uint8_t)kv_A(replace_stack, kv_size(replace_stack) - 1) : -1;
+  if (ch == NUL) {
+    kv_size(replace_stack)--;
+  }
+  return ch;
 }
 
 /// Join the top two items on the replace stack.  This removes to "off"'th NUL
@@ -2861,11 +2852,11 @@ static int replace_pop(void)
 /// @param off  offset for which NUL to remove
 static void replace_join(int off)
 {
-  for (ssize_t i = replace_stack_nr; --i >= 0;) {
-    if (replace_stack[i] == NUL && off-- <= 0) {
-      replace_stack_nr--;
-      memmove(replace_stack + i, replace_stack + i + 1,
-              (size_t)(replace_stack_nr - i));
+  for (ssize_t i = (ssize_t)kv_size(replace_stack); --i >= 0;) {
+    if (kv_A(replace_stack, i) == NUL && off-- <= 0) {
+      kv_size(replace_stack)--;
+      memmove(&kv_A(replace_stack, i), &kv_A(replace_stack, i + 1),
+              (kv_size(replace_stack) - (size_t)i));
       return;
     }
   }
@@ -2875,72 +2866,25 @@ static void replace_join(int off)
 /// before the cursor.  Can only be used in MODE_REPLACE or MODE_VREPLACE state.
 static void replace_pop_ins(void)
 {
-  int cc;
   int oldState = State;
 
   State = MODE_NORMAL;                       // don't want MODE_REPLACE here
-  while ((cc = replace_pop()) > 0) {
-    mb_replace_pop_ins(cc);
+  while ((replace_pop_if_nul()) > 0) {
+    mb_replace_pop_ins();
     dec_cursor();
   }
   State = oldState;
 }
 
-// Insert bytes popped from the replace stack. "cc" is the first byte.  If it
-// indicates a multi-byte char, pop the other bytes too.
-static void mb_replace_pop_ins(int cc)
-{
-  int n;
-  uint8_t buf[MB_MAXBYTES + 1];
-
-  if ((n = MB_BYTE2LEN(cc)) > 1) {
-    buf[0] = (uint8_t)cc;
-    for (int i = 1; i < n; i++) {
-      buf[i] = (uint8_t)replace_pop();
-    }
-    ins_bytes_len((char *)buf, (size_t)n);
-  } else {
-    ins_char(cc);
-  }
-
-  // Handle composing chars.
-  while (true) {
-    int c = replace_pop();
-    if (c == -1) {                // stack empty
-      break;
-    }
-    if ((n = MB_BYTE2LEN(c)) == 1) {
-      // Not a multi-byte char, put it back.
-      replace_push(c);
-      break;
-    }
-
-    buf[0] = (uint8_t)c;
-    assert(n > 1);
-    for (int i = 1; i < n; i++) {
-      buf[i] = (uint8_t)replace_pop();
-    }
-    // TODO(bfredl): by fixing replace_push_mb, upgrade to use
-    // the new composing algorithm
-    if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
-      ins_bytes_len((char *)buf, (size_t)n);
-    } else {
-      // Not a composing char, put it back.
-      for (int i = n - 1; i >= 0; i--) {
-        replace_push(buf[i]);
-      }
-      break;
-    }
-  }
-}
-
-// make the replace stack empty
-// (called when exiting replace mode)
-static void replace_flush(void)
+/// Insert multibyte char popped from the replace stack.
+///
+/// caller must already have checked the top of the stack is not NUL!!
+static void mb_replace_pop_ins(void)
 {
-  XFREE_CLEAR(replace_stack);
-  replace_stack_len = 0;
-  replace_stack_nr = 0;
+  int len = utf_head_off(&kv_A(replace_stack, 0),
+                         &kv_A(replace_stack, kv_size(replace_stack) - 1)) + 1;
+  kv_size(replace_stack) -= (size_t)len;
+  ins_bytes_len(&kv_A(replace_stack, kv_size(replace_stack)), (size_t)len);
 }
 
 // Handle doing a BS for one character.
@@ -2955,7 +2899,7 @@ static void replace_do_bs(int limit_col)
   colnr_T start_vcol;
   const int l_State = State;
 
-  int cc = replace_pop();
+  int cc = replace_pop_if_nul();
   if (cc > 0) {
     int orig_len = 0;
     int orig_vcols = 0;
@@ -2969,7 +2913,6 @@ static void replace_do_bs(int limit_col)
     if (l_State & VREPLACE_FLAG) {
       orig_len = get_cursor_pos_len();
     }
-    replace_push(cc);
     replace_pop_ins();
 
     if (l_State & VREPLACE_FLAG) {
@@ -3628,9 +3571,9 @@ static void ins_shift(int c, int lastc)
     if (lastc == '^') {
       old_indent = get_indent();        // remember curr. indent
     }
-    change_indent(INDENT_SET, 0, true, 0, true);
+    change_indent(INDENT_SET, 0, true, true);
   } else {
-    change_indent(c == Ctrl_D ? INDENT_DEC : INDENT_INC, 0, true, 0, true);
+    change_indent(c == Ctrl_D ? INDENT_DEC : INDENT_INC, 0, true, true);
   }
 
   if (did_ai && *skipwhite(get_cursor_line_ptr()) != NUL) {
@@ -3749,7 +3692,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
     // cc >= 0: NL was replaced, put original characters back
     cc = -1;
     if (State & REPLACE_FLAG) {
-      cc = replace_pop();           // returns -1 if NL was inserted
+      cc = replace_pop_if_nul();  // returns -1 if NL was inserted
     }
     // In replace mode, in the line we started replacing, we only move the
     // cursor.
@@ -3795,9 +3738,9 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
         // restore characters (blanks) deleted after cursor
         while (cc > 0) {
           colnr_T save_col = curwin->w_cursor.col;
-          mb_replace_pop_ins(cc);
+          mb_replace_pop_ins();
           curwin->w_cursor.col = save_col;
-          cc = replace_pop();
+          cc = replace_pop_if_nul();
         }
         // restore the characters that NL replaced
         replace_pop_ins();
@@ -3906,7 +3849,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
         } else {
           ins_str(" ");
           if ((State & REPLACE_FLAG)) {
-            replace_push(NUL);
+            replace_push_nul();
           }
         }
       }
@@ -4316,7 +4259,7 @@ static bool ins_tab(void)
     } else {
       ins_str(" ");
       if (State & REPLACE_FLAG) {            // no char replaced
-        replace_push(NUL);
+        replace_push_nul();
       }
     }
   }
@@ -4483,7 +4426,7 @@ bool ins_eol(int c)
   // character under the cursor.  Only push a NUL on the replace stack,
   // nothing to put back when the NL is deleted.
   if ((State & REPLACE_FLAG) && !(State & VREPLACE_FLAG)) {
-    replace_push(NUL);
+    replace_push_nul();
   }
 
   // In MODE_VREPLACE state, a NL replaces the rest of the line, and starts
@@ -4684,7 +4627,7 @@ static void ins_try_si(int c)
       i = get_indent();
       curwin->w_cursor = old_pos;
       if (State & VREPLACE_FLAG) {
-        change_indent(INDENT_SET, i, false, NUL, true);
+        change_indent(INDENT_SET, i, false, true);
       } else {
         set_indent(i, SIN_CHANGED);
       }
diff --git a/src/nvim/indent.c b/src/nvim/indent.c
index 2f994036ad..895d0d9f37 100644
--- a/src/nvim/indent.c
+++ b/src/nvim/indent.c
@@ -1407,7 +1407,7 @@ void fixthisline(IndentGetter get_the_indent)
     return;
   }
 
-  change_indent(INDENT_SET, amount, false, 0, true);
+  change_indent(INDENT_SET, amount, false, true);
   if (linewhite(curwin->w_cursor.lnum)) {
     did_ai = true;  // delete the indent if the line stays empty
   }
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 6fd51e773d..756ebd7b31 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in)
 }
 
 /// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ascii! only multibyte and illegal sequences. ASCII (including NUL)
+/// are treated like illegal sequences.
 ///
 /// @param[in]  p      String to convert.
 /// @param[in]  len    Length of the character in bytes, 0 or 1 if illegal.
 ///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+///         ASCII, including NUL).
 int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
   FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
 {
@@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in)
     start--;
   }
 
-  uint8_t cur_len = utf8len_tab[*start];
-  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
-  if (cur_code < 0) {
+  const uint8_t last_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+  if (cur_code < 0 || p - start >= last_len) {
     return 0;  // p must be part of an illegal sequence
   }
-  const uint8_t * const safe_end = start + cur_len;
+  const uint8_t * const safe_end = start + last_len;
 
   int cur_bc = utf8proc_get_property(cur_code)->boundclass;
-  if (always_break(cur_bc)) {
+  if (always_break(cur_bc) || start == base) {
     return (int)(p - start);
   }
 
@@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in)
   const uint8_t *cur_pos = start;
   const uint8_t *const p_start = start;
 
-  if (start == base) {
-    return (int)(p - start);
-  }
+  while (true) {
+    if (start[-1] == NUL) {
+      break;
+    }
+
+    start--;
+    if (*start < 0x80) {  // stop on ascii, we are done
+      break;
+    }
 
-  start--;
-  while (*start >= 0x80) {  // stop on ascii, we are done
     while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
       start--;
     }
 
-    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
-    if (prev_code < 0) {
+    int prev_len = utf8len_tab[*start];
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+    if (prev_code < 0 || prev_len < cur_pos - start) {
       start = cur_pos;  // start at valid sequence after invalid bytes
       break;
     }
@@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in)
     cur_pos = start;
     cur_bc = prev_bc;
     cur_code = prev_code;
-
-    start--;
   }
 
   // hot path: we are already on the first codepoint of a sequence
-  if (start == p_start) {
+  if (start == p_start && last_len > p - start) {
     return (int)(p - start);
   }
 
diff --git a/src/nvim/ops.c b/src/nvim/ops.c
index 4b3f69a378..8faf0a6b47 100644
--- a/src/nvim/ops.c
+++ b/src/nvim/ops.c
@@ -306,7 +306,7 @@ void shift_line(bool left, bool round, int amount, int call_changed_bytes)
 
   // Set new indent
   if (State & VREPLACE_FLAG) {
-    change_indent(INDENT_SET, count, false, NUL, call_changed_bytes);
+    change_indent(INDENT_SET, count, false, call_changed_bytes);
   } else {
     set_indent(count, call_changed_bytes ? SIN_CHANGED : 0);
   }
diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c
index 30c7d0ee92..9095d4e8c9 100644
--- a/src/nvim/textformat.c
+++ b/src/nvim/textformat.c
@@ -400,7 +400,7 @@ void internal_format(int textwidth, int second_indent, int flags, bool format_on
         }
         if (second_indent >= 0) {
           if (State & VREPLACE_FLAG) {
-            change_indent(INDENT_SET, second_indent, false, NUL, true);
+            change_indent(INDENT_SET, second_indent, false, true);
           } else if (leader_len > 0 && second_indent - leader_len > 0) {
             int padding = second_indent - leader_len;
 
diff --git a/test/functional/editor/mode_insert_spec.lua b/test/functional/editor/mode_insert_spec.lua
index fc1e6c4ee4..87d5c46134 100644
--- a/test/functional/editor/mode_insert_spec.lua
+++ b/test/functional/editor/mode_insert_spec.lua
@@ -351,4 +351,97 @@ describe('insert-mode', function()
       eq(2, api.nvim_win_get_cursor(0)[1])
     end)
   end)
+
+  it('backspace after replacing multibyte chars', function()
+    local screen = Screen.new(30, 3)
+    screen:attach()
+    api.nvim_buf_set_lines(0, 0, -1, true, { 'test ȧ̟̜̝̅̚m̆̉̐̐̇̈ å' })
+    feed('^Rabcdefghi')
+    screen:expect([[
+      abcdefghi^                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcdefgh^å                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcdefg^ å                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcdef^m̆̉̐̐̇̈ å                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcde^ȧ̟̜̝̅̚m̆̉̐̐̇̈ å                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcd^ ȧ̟̜̝̅̚m̆̉̐̐̇̈ å                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<esc>')
+
+    api.nvim_buf_set_lines(0, 0, -1, true, { 'wow 🧑‍🌾🏳️‍⚧️x' })
+    feed('^Rabcd')
+
+    screen:expect([[
+      abcd^🧑‍🌾🏳️‍⚧️x                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('e')
+    screen:expect([[
+      abcde^🏳️‍⚧️x                      |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('f')
+    screen:expect([[
+      abcdef^x                       |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcde^🏳️‍⚧️x                      |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abcd^🧑‍🌾🏳️‍⚧️x                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+
+    feed('<bs>')
+    screen:expect([[
+      abc^ 🧑‍🌾🏳️‍⚧️x                     |
+      {1:~                             }|
+      {5:-- REPLACE --}                 |
+    ]])
+  end)
 end)
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index 787a8862ae..62390c8794 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -4,7 +4,6 @@ local itp = t.gen_itp(it)
 local ffi = t.ffi
 local eq = t.eq
 local to_cstr = t.to_cstr
-local ok = t.ok
 
 local lib = t.cimport(
   './src/nvim/mbyte.h',
@@ -302,7 +301,10 @@ describe('mbyte', function()
       local mb_glyphs = {}
       while pos < len do
         local clen = lib.utfc_ptr2len(cstr + pos)
-        ok(clen > 0) -- otherwise we get stuck
+        if clen == 0 then
+          eq(0, string.byte(str, pos + 1)) -- only NUL bytes can have length zero
+          clen = 1 -- but skip it, otherwise we get stuck
+        end
         if clen > 1 then
           table.insert(mb_glyphs, string.sub(str, pos + 1, pos + clen))
         end
@@ -325,13 +327,18 @@ describe('mbyte', function()
     -- stylua doesn't like ZWJ chars..
     -- stylua: ignore start
     check('hej och hå 🧑‍🌾!', { 'å', '🧑‍🌾' })
-    -- emoji only (various kinds of combinations, use g8 to see them)
+
+    -- emoji (various kinds of combinations, use g8 to see them)
     check("🏳️‍⚧️🧑‍🌾❤️😂🏴‍☠️", {"🏳️‍⚧️", "🧑‍🌾", "❤️", "😂", "🏴‍☠️"})
     check('🏳️‍⚧️xy🧑‍🌾\r❤️😂å🏴‍☠️', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '' })
+    check('🏳️‍⚧️\000🧑‍🌾\000❤️\000😂\000å\000🏴‍☠️\000', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', 'å', '🏴‍☠️', '' })
+    check('\195🏳️‍⚧️\198🧑‍🌾\165❤️\168\195😂\255🏴‍☠️\129\165', { '🏳️‍⚧️', '🧑‍🌾', '❤️', '😂', '🏴‍☠️', '' })
 
     check('🇦🅱️ 🇦🇽 🇦🇨🇦 🇲🇽🇹🇱',{'🇦', '🅱️', '🇦🇽', '🇦🇨', '🇦', '🇲🇽', '🇹🇱'})
     check('🏴󠁧󠁢󠁳󠁣󠁴󠁿🏴󠁧󠁢󠁷󠁬󠁳󠁿', {'🏴󠁧󠁢󠁳󠁣󠁴󠁿', '🏴󠁧󠁢󠁷󠁬󠁳󠁿'})
 
+    check('å\165ü\195aëq\168β\000\169本\255', {'å', 'ü', 'ë', 'β', '本'})
+
     lib.p_arshape = true -- default
     check('سلام', { 'س', 'لا', 'م' })
     lib.p_arshape = false
author	bfredl <bjorn.linse@gmail.com>	2024-09-04 12:09:42 +0200
committer	bfredl <bjorn.linse@gmail.com>	2024-09-06 10:22:29 +0200
commit	fa99afe35eb5d8cf01d875e12b53165bf1104a60 (patch)
tree	2a304fcd0262e71ef64d2bbb91fcbdc5eb89507d
parent	7b7c95dac97d6ea4f10855cc198dce650a796c20 (diff)
download	rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.tar.gz rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.tar.bz2 rneovim-fa99afe35eb5d8cf01d875e12b53165bf1104a60.zip