1 files changed, 22 insertions, 17 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 07bf574c17..df490ff3c9 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -523,12 +523,14 @@ int utf_ptr2cells(const char *p_in)
 }
 
 /// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ASCII! Only multibyte and illegal sequences. ASCII bytes (including
+/// NUL) are treated like illegal sequences.
 ///
 /// @param[in]  p      String to convert.
 /// @param[in]  len    Length of the character in bytes, 0 or 1 if illegal.
 ///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+///         ASCII, including NUL).
 int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
   FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
 {
@@ -1780,15 +1782,15 @@ int utf_head_off(const char *base_in, const char *p_in)
     start--;
   }
 
-  uint8_t cur_len = utf8len_tab[*start];
-  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
-  if (cur_code < 0) {
+  const uint8_t last_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+  if (cur_code < 0 || p - start >= last_len) {
     return 0;  // p must be part of an illegal sequence
   }
-  const uint8_t * const safe_end = start + cur_len;
+  const uint8_t * const safe_end = start + last_len;
 
   int cur_bc = utf8proc_get_property(cur_code)->boundclass;
-  if (always_break(cur_bc)) {
+  if (always_break(cur_bc) || start == base) {
     return (int)(p - start);
   }
 
@@ -1796,18 +1798,23 @@ int utf_head_off(const char *base_in, const char *p_in)
   const uint8_t *cur_pos = start;
   const uint8_t *const p_start = start;
 
-  if (start == base) {
-    return (int)(p - start);
-  }
+  while (true) {
+    if (start[-1] == NUL) {
+      break;
+    }
+
+    start--;
+    if (*start < 0x80) {  // stop on ascii, we are done
+      break;
+    }
 
-  start--;
-  while (*start >= 0x80) {  // stop on ascii, we are done
     while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
       start--;
     }
 
-    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
-    if (prev_code < 0) {
+    int prev_len = utf8len_tab[*start];
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+    if (prev_code < 0 || prev_len < cur_pos - start) {
       start = cur_pos;  // start at valid sequence after invalid bytes
       break;
     }
@@ -1822,12 +1829,10 @@ int utf_head_off(const char *base_in, const char *p_in)
     cur_pos = start;
     cur_bc = prev_bc;
     cur_code = prev_code;
-
-    start--;
   }
 
   // hot path: we are already on the first codepoint of a sequence
-  if (start == p_start) {
+  if (start == p_start && last_len > p - start) {
     return (int)(p - start);
   }