1 files changed, 304 insertions, 160 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index a345795bbe..01e720283e 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -32,6 +32,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <utf8proc.h>
 #include <uv.h>
 #include <wctype.h>
 
@@ -43,6 +44,7 @@
 #include "nvim/cmdexpand_defs.h"
 #include "nvim/cursor.h"
 #include "nvim/drawscreen.h"
+#include "nvim/errors.h"
 #include "nvim/eval/typval.h"
 #include "nvim/eval/typval_defs.h"
 #include "nvim/getchar.h"
@@ -83,7 +85,6 @@ struct interval {
 // uncrustify:off
 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "mbyte.c.generated.h"
-# include "unicode_tables.generated.h"
 #endif
 // uncrustify:on
 
@@ -442,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
   return utf_class_tab(utf_ptr2char(p), chartab);
 }
 
-// Return true if "c" is in "table".
-static bool intable(const struct interval *table, size_t n_items, int c)
-  FUNC_ATTR_PURE
+static bool prop_is_emojilike(const utf8proc_property_t *prop)
 {
-  assert(n_items > 0);
-  // first quick check for Latin1 etc. characters
-  if (c < table[0].first) {
-    return false;
-  }
-
-  assert(n_items <= SIZE_MAX / 2);
-  // binary search in table
-  size_t bot = 0;
-  size_t top = n_items;
-  do {
-    size_t mid = (bot + top) >> 1;
-    if (table[mid].last < c) {
-      bot = mid + 1;
-    } else if (table[mid].first > c) {
-      top = mid;
-    } else {
-      return true;
-    }
-  } while (top > bot);
-  return false;
+  return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+         || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
 }
 
 /// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
@@ -494,13 +474,18 @@ int utf_char2cells(int c)
     return n;
   }
 
-  if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+
+  if (prop->charwidth == 2) {
     return 2;
   }
-  if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+  if (*p_ambw == 'd' && prop->ambiguous_width) {
     return 2;
   }
-  if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+
+  // Characters below 1F000 may be considered single width traditionally,
+  // making them double width causes problems.
+  if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
     return 2;
   }
 
@@ -509,31 +494,43 @@ int utf_char2cells(int c)
 
 /// Return the number of display cells character at "*p" occupies.
 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char *p)
+int utf_ptr2cells(const char *p_in)
 {
+  const uint8_t *p = (const uint8_t *)p_in;
   // Need to convert to a character number.
-  if ((uint8_t)(*p) >= 0x80) {
-    int c = utf_ptr2char(p);
+  if ((*p) >= 0x80) {
+    int len = utf8len_tab[*p];
+    int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
     // An illegal byte is displayed as <xx>.
-    if (utf_ptr2len(p) == 1 || c == NUL) {
+    if (c <= 0) {
       return 4;
     }
     // If the char is ASCII it must be an overlong sequence.
     if (c < 0x80) {
       return char2cells(c);
     }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji
+        && prop_is_emojilike(utf8proc_get_property(c))) {
+      int c2 = utf_ptr2char(p_in + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
   }
   return 1;
 }
 
 /// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ASCII! Only multibyte and illegal sequences. ASCII bytes (including
+/// NUL) are treated like illegal sequences.
 ///
 /// @param[in]  p      String to convert.
 /// @param[in]  len    Length of the character in bytes, 0 or 1 if illegal.
 ///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+///         ASCII, including NUL).
 int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
   FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
 {
@@ -601,7 +598,8 @@ int utf_ptr2cells_len(const char *p, int size)
 {
   // Need to convert to a wide character.
   if (size > 0 && (uint8_t)(*p) >= 0x80) {
-    if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
+    int len = utf_ptr2len_len(p, size);
+    if (len < utf8len_tab[(uint8_t)(*p)]) {
       return 1;        // truncated
     }
     int c = utf_ptr2char(p);
@@ -613,7 +611,16 @@ int utf_ptr2cells_len(const char *p, int size)
     if (c < 0x80) {
       return char2cells(c);
     }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji && size > len
+        && prop_is_emojilike(utf8proc_get_property(c))
+        && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
+      int c2 = utf_ptr2char(p + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
   }
   return 1;
 }
@@ -646,8 +653,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
   size_t clen = 0;
 
   for (const char *p = str; *p != NUL && p < str + size;
-       p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
-    clen += (size_t)utf_ptr2cells(p);
+       p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
+    clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
   }
 
   return clen;
@@ -791,29 +798,48 @@ int mb_cptr2char_adv(const char **pp)
   return c;
 }
 
+/// When "c" is the first char of a string, determine if it needs to be prefixed
+/// by a space byte to be drawn correctly, and not merge with the space left of
+/// the string.
+bool utf_iscomposing_first(int c)
+{
+  return c >= 128 && !utf8proc_grapheme_break(' ', c);
+}
+
 /// Check if the character pointed to by "p2" is a composing character when it
-/// comes after "p1".  For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-bool utf_composinglike(const char *p1, const char *p2)
+/// comes after "p1".
+///
+/// We use the definition in UAX#29 as implemented by utf8proc with the following
+/// exceptions:
+///
+/// - ASCII chars always begin a new cluster. This is a long assumed invariant
+///   in the code base and very useful for performance (we can exit early for ASCII
+///   all over the place, branch predictor go brrr in ASCII-only text).
+///   As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII,
+///   which should be exceedingly rare (these PREPEND chars are expected to be
+///   followed by multibyte chars within the same script family)
+///
+/// - When 'arabicshape' is active, some pairs of Arabic letters "ab" are replaced with
+///   "c", taking a single cell, which behaves like a cluster.
+///
+/// @param state  Should be set to GRAPHEME_STATE_INIT before the first call.
+///        It is allowed to be NULL, but then some longer sequences,
+///        like ZWJ-based emoji, are not handled.
+bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
+  FUNC_ATTR_NONNULL_ARG(1, 2)
 {
-  int c2 = utf_ptr2char(p2);
-  if (utf_iscomposing(c2)) {
-    return true;
-  }
-  if (!arabic_maycombine(c2)) {
+  if ((uint8_t)(*p2) < 128) {
     return false;
   }
-  return arabic_combine(utf_ptr2char(p1), c2);
-}
 
-/// Check if the next character is a composing character when it
-/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-/// returns false for negative values
-bool utf_char_composinglike(int32_t const first, int32_t const next)
-  FUNC_ATTR_PURE
-{
-  return utf_iscomposing(next) || arabic_combine(first, next);
+  int first = utf_ptr2char(p1);
+  int second = utf_ptr2char(p2);
+
+  if (!utf8proc_grapheme_break_stateful(first, second, state)) {
+    return true;
+  }
+
+  return arabic_combine(first, second);
 }
 
 /// Get the screen char at the beginning of a string
@@ -832,7 +858,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
 {
   int c = utf_ptr2char(p);
   *firstc = c;  // NOT optional, you are gonna need it
-  bool first_compose = utf_iscomposing(c);
+  bool first_compose = utf_iscomposing_first(c);
   size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
   size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
 
@@ -843,16 +869,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
   return schar_from_buf_first(p, len, first_compose);
 }
 
-/// Get the screen char at the beginning of a string with length
+/// Get the screen char from a char with a known length
 ///
 /// Like utfc_ptr2schar but use no more than p[maxlen].
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
   FUNC_ATTR_NONNULL_ALL
 {
-  assert(maxlen > 0);
-
-  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
-  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+  if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
     // invalid or truncated sequence
     *firstc = (uint8_t)(*p);
     return 0;
@@ -860,11 +883,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
 
   int c = utf_ptr2char(p);
   *firstc = c;
-  bool first_compose = utf_iscomposing(c);
-  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
-  len = (size_t)utfc_ptr2len_len(p, maxlen);
+  bool first_compose = utf_iscomposing_first(c);
+  int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+  if (len > maxlen) {
+    len = utfc_ptr2len_len(p, maxlen);
+  }
 
-  return schar_from_buf_first(p, len, first_compose);
+  return schar_from_buf_first(p, (size_t)len, first_compose);
 }
 
 /// Caller must ensure there is space for `first_compose`
@@ -962,8 +987,9 @@ int utfc_ptr2len(const char *const p)
 
   // Check for composing characters.
   int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
   while (true) {
-    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
+    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
       return len;
     }
 
@@ -994,9 +1020,10 @@ int utfc_ptr2len_len(const char *p, int size)
     return 1;
   }
 
-  // Check for composing characters.  We can handle only the first six, but
+  // Check for composing characters.  We can only display a limited amount, but
   // skip all of them (otherwise the cursor would get stuck).
   int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
   while (len < size) {
     if ((uint8_t)p[len] < 0x80) {
       break;
@@ -1009,7 +1036,7 @@ int utfc_ptr2len_len(const char *p, int size)
       break;
     }
 
-    if (!utf_composinglike(p + prevlen, p + len)) {
+    if (!utf_composinglike(p + prevlen, p + len, &state)) {
       break;
     }
 
@@ -1082,13 +1109,21 @@ int utf_char2bytes(const int c, char *const buf)
   }
 }
 
-/// Return true if "c" is a composing UTF-8 character.
-/// This means it will be drawn on top of the preceding character.
+/// Return true if "c" is a legacy composing UTF-8 character.
+///
+/// This is deprecated in favour of utf_composinglike() which uses the modern
+/// stateful algorithm to determine grapheme clusters. Still available
+/// to support some legacy code which hasn't been refactored yet.
+///
+/// To check if a char would combine with a preceding space, use
+/// utf_iscomposing_first() instead.
+///
 /// Based on code from Markus Kuhn.
 /// Returns false for negative values.
-bool utf_iscomposing(int c)
+bool utf_iscomposing_legacy(int c)
 {
-  return intable(combining, ARRAY_SIZE(combining), c);
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+  return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
 }
 
 #ifdef __SSE2__
@@ -1133,6 +1168,33 @@ bool utf_printable(int c)
 
 #else
 
+// Return true if "c" is in "table".
+static bool intable(const struct interval *table, size_t n_items, int c)
+  FUNC_ATTR_PURE
+{
+  assert(n_items > 0);
+  // first quick check for Latin1 etc. characters
+  if (c < table[0].first) {
+    return false;
+  }
+
+  assert(n_items <= SIZE_MAX / 2);
+  // binary search in table
+  size_t bot = 0;
+  size_t top = n_items;
+  do {
+    size_t mid = (bot + top) >> 1;
+    if (table[mid].last < c) {
+      bot = mid + 1;
+    } else if (table[mid].first > c) {
+      top = mid;
+    } else {
+      return true;
+    }
+  } while (top > bot);
+  return false;
+}
+
 // Return true for characters that can be displayed in a normal way.
 // Only for characters of 0x100 and above!
 bool utf_printable(int c)
@@ -1255,8 +1317,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
     return 1;               // punctuation
   }
 
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
   // emoji
-  if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+  if (prop_is_emojilike(prop)) {
     return 3;
   }
 
@@ -1276,47 +1339,51 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
   return 2;
 }
 
-bool utf_ambiguous_width(int c)
+bool utf_ambiguous_width(const char *p)
 {
-  return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
-                       || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
-}
+  // be quick if there is nothing to print or ASCII-only
+  if (p[0] == NUL || p[1] == NUL) {
+    return false;
+  }
 
-// Generic conversion function for case operations.
-// Return the converted equivalent of "a", which is a UCS-4 character.  Use
-// the given conversion "table".  Uses binary search on "table".
-static int utf_convert(int a, const convertStruct *const table, size_t n_items)
-{
-  // indices into table
-  size_t start = 0;
-  size_t end = n_items;
-  while (start < end) {
-    // need to search further
-    size_t mid = (end + start) / 2;
-    if (table[mid].rangeEnd < a) {
-      start = mid + 1;
-    } else {
-      end = mid;
+  CharInfo info = utf_ptr2CharInfo(p);
+  if (info.value >= 0x80) {
+    const utf8proc_property_t *prop = utf8proc_get_property(info.value);
+    if (prop->ambiguous_width || prop_is_emojilike(prop)) {
+      return true;
     }
   }
-  if (start < n_items
-      && table[start].rangeStart <= a
-      && a <= table[start].rangeEnd
-      && (a - table[start].rangeStart) % table[start].step == 0) {
-    return a + table[start].offset;
-  }
-  return a;
+
+  // check if second sequence is 0xFE0F VS-16 which can turn things into emoji,
+  // safe with NUL (no second sequence)
+  return memcmp(p + info.len, "\xef\xb8\x8f", 3) == 0;
 }
 
 // Return the folded-case equivalent of "a", which is a UCS-4 character.  Uses
-// simple case folding.
+// full case folding.
 int utf_fold(int a)
 {
   if (a < 0x80) {
     // be fast for ASCII
     return a >= 0x41 && a <= 0x5a ? a + 32 : a;
   }
-  return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
+
+  // TODO(dundargoc): utf8proc only does full case folding, which breaks some tests. This is a
+  // temporary workaround to circumvent failing tests.
+  //
+  // (0xdf) ß == ss in full casefolding. Using this however breaks the vim spell tests and the error
+  // E763 is thrown. This is due to the test spells relying on the vim spell files.
+  //
+  // (0x130) İ == i̇ in full casefolding.
+  if (a == 0xdf || a == 0x130) {
+    return a;
+  }
+
+  utf8proc_int32_t result[1];
+
+  utf8proc_ssize_t res = utf8proc_decompose_char(a, result, 1, UTF8PROC_CASEFOLD, NULL);
+
+  return (res == 1) ? result[0] : a;
 }
 
 // Vim's own character class functions.  These exist because many library
@@ -1324,9 +1391,6 @@ int utf_fold(int a)
 // invalid values or can't handle latin1 when the locale is C.
 // Speed is most important here.
 
-// Note: UnicodeData.txt does not define U+1E9E as being the corresponding upper
-// case letter for U+00DF (ß), however it is part of the toLower table
-
 /// Return the upper-case equivalent of "a", which is a UCS-4 character.  Use
 /// simple case folding.
 int mb_toupper(int a)
@@ -1345,14 +1409,12 @@ int mb_toupper(int a)
     return TOUPPER_LOC(a);
   }
 
-  // For any other characters use the above mapping table.
-  return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
+  return utf8proc_toupper(a);
 }
 
 bool mb_islower(int a)
 {
-  // German sharp s is lower case but has no upper case equivalent.
-  return (mb_toupper(a) != a) || a == 0xdf;
+  return mb_toupper(a) != a;
 }
 
 /// Return the lower-case equivalent of "a", which is a UCS-4 character.  Use
@@ -1373,8 +1435,7 @@ int mb_tolower(int a)
     return TOLOWER_LOC(a);
   }
 
-  // For any other characters use the above mapping table.
-  return utf_convert(a, toLower, ARRAY_SIZE(toLower));
+  return utf8proc_tolower(a);
 }
 
 bool mb_isupper(int a)
@@ -1388,7 +1449,7 @@ bool mb_isalpha(int a)
   return mb_islower(a) || mb_isupper(a);
 }
 
-static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
+int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
 {
   int c1, c2;
   char buffer[6];
@@ -1545,7 +1606,7 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
     return uv_translate_sys_error(GetLastError());
   }
 
-  (*utf8)[bufsize] = '\0';
+  (*utf8)[bufsize] = NUL;
   return 0;
 }
 
@@ -1673,6 +1734,26 @@ void show_utf8(void)
   msg(IObuff, 0);
 }
 
+/// @return true if boundclass bc always starts a new cluster regardless of what's before
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break(int bc)
+{
+  return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
+}
+
+/// @return true if bc2 always starts a cluster after bc1
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break_two(int bc1, int bc2)
+{
+  // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
+  // "always_break" on first iteration or when it was bc1 in the previous iteration
+  return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
+          || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
+          || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+              && (bc1 == UTF8PROC_BOUNDCLASS_OTHER
+                  || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
+}
+
 /// Return offset from "p" to the start of a character, including composing characters.
 /// "base" must be the start of the string, which must be NUL terminated.
 /// If "p" points to the NUL at the end of the string return 0.
@@ -1686,50 +1767,111 @@ int utf_head_off(const char *base_in, const char *p_in)
   const uint8_t *base = (uint8_t *)base_in;
   const uint8_t *p = (uint8_t *)p_in;
 
-  // Skip backwards over trailing bytes: 10xx.xxxx
-  // Skip backwards again if on a composing char.
-  const uint8_t *q;
-  for (q = p;; q--) {
-    // Move s to the last byte of this char.
-    const uint8_t *s;
-    for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
-
-    // Move q to the first byte of this char.
-    while (q > base && (*q & 0xc0) == 0x80) {
-      q--;
-    }
-    // Check for illegal sequence. Do allow an illegal byte after where we
-    // started.
-    int len = utf8len_tab[*q];
-    if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
-      return 0;
+  const uint8_t *start = p;
+
+  // move start to the first byte of this codepoint
+  // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
+  while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
+    start--;
+  }
+
+  const uint8_t last_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+  if (cur_code < 0 || p - start >= last_len) {
+    return 0;  // p must be part of an illegal sequence
+  }
+  const uint8_t * const safe_end = start + last_len;
+
+  int cur_bc = utf8proc_get_property(cur_code)->boundclass;
+  if (always_break(cur_bc) || start == base) {
+    return (int)(p - start);
+  }
+
+  // backtrack to find the start of a cluster. we might go too far, checked in the next loop
+  const uint8_t *cur_pos = start;
+  const uint8_t *const p_start = start;
+
+  while (true) {
+    if (start[-1] == NUL) {
+      break;
     }
 
-    if (q <= base) {
+    start--;
+    if (*start < 0x80) {  // stop on ascii, we are done
       break;
     }
 
-    int c = utf_ptr2char((char *)q);
-    if (utf_iscomposing(c)) {
-      continue;
+    while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
+      start--;
     }
 
-    if (arabic_maycombine(c)) {
-      // Advance to get a sneak-peak at the next char
-      const uint8_t *j = q;
-      j--;
-      // Move j to the first byte of this char.
-      while (j > base && (*j & 0xc0) == 0x80) {
-        j--;
-      }
-      if (arabic_combine(utf_ptr2char((char *)j), c)) {
-        continue;
-      }
+    int prev_len = utf8len_tab[*start];
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+    if (prev_code < 0 || prev_len < cur_pos - start) {
+      start = cur_pos;  // start at valid sequence after invalid bytes
+      break;
     }
-    break;
+
+    int prev_bc = utf8proc_get_property(prev_code)->boundclass;
+    if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
+      start = cur_pos;  // prev_code cannot be a part of this cluster
+      break;
+    } else if (start == base) {
+      break;
+    }
+    cur_pos = start;
+    cur_bc = prev_bc;
+    cur_code = prev_code;
+  }
+
+  // hot path: we are already on the first codepoint of a sequence
+  if (start == p_start && last_len > p - start) {
+    return (int)(p - start);
   }
 
-  return (int)(p - q);
+  const uint8_t *q = start;
+  while (q < p) {
+    // don't need to find end of cluster. once we reached the codepoint of p, we are done
+    int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
+
+    if (q + len > p) {
+      return (int)(p - q);
+    }
+
+    q += len;
+  }
+
+  return 0;
+}
+
+/// Assumes caller already handles ascii. see `utfc_next`
+StrCharInfo utfc_next_impl(StrCharInfo cur)
+{
+  int32_t prev_code = cur.chr.value;
+  uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
+  GraphemeState state = GRAPHEME_STATE_INIT;
+  assert(*next >= 0x80);
+
+  while (true) {
+    uint8_t const next_len = utf8len_tab[*next];
+    int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
+    if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
+        && !arabic_combine(prev_code, next_code)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
+      };
+    }
+
+    prev_code = next_code;
+    next += next_len;
+    if (EXPECT(*next < 0x80U, true)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = *next, .len = 1 },
+      };
+    }
+  }
 }
 
 // Whether space is NOT allowed before/after 'c'.
@@ -2688,7 +2830,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
             c = 0x100; break;                   // not in latin9
           }
         }
-        if (!utf_iscomposing(c)) {              // skip composing chars
+        if (!utf_iscomposing_legacy(c)) {  // skip composing chars
           if (c < 0x100) {
             *d++ = (uint8_t)c;
           } else if (vcp->vc_fail) {
@@ -2776,17 +2918,17 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
     emsg(_(e_listreq));
     return;
   }
+
   const list_T *const l = argvars[0].vval.v_list;
-  if (tv_list_len(l) == 0) {
+  cw_interval_T *table = NULL;
+  const size_t table_size = (size_t)tv_list_len(l);
+  if (table_size == 0) {
     // Clearing the table.
-    xfree(cw_table);
-    cw_table = NULL;
-    cw_table_size = 0;
-    return;
+    goto update;
   }
 
   // Note: use list_T instead of listitem_T so that TV_LIST_ITEM_NEXT can be used properly below.
-  const list_T **ptrs = xmalloc(sizeof(const list_T *) * (size_t)tv_list_len(l));
+  const list_T **ptrs = xmalloc(sizeof(const list_T *) * table_size);
 
   // Check that all entries are a list with three numbers, the range is
   // valid and the cell width is valid.
@@ -2838,12 +2980,12 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
   });
 
   // Sort the list on the first number.
-  qsort((void *)ptrs, (size_t)tv_list_len(l), sizeof(const list_T *), tv_nr_compare);
+  qsort((void *)ptrs, table_size, sizeof(const list_T *), tv_nr_compare);
 
-  cw_interval_T *table = xmalloc(sizeof(cw_interval_T) * (size_t)tv_list_len(l));
+  table = xmalloc(sizeof(cw_interval_T) * table_size);
 
   // Store the items in the new table.
-  for (item = 0; item < tv_list_len(l); item++) {
+  for (item = 0; (size_t)item < table_size; item++) {
     const list_T *const li_l = ptrs[item];
     const listitem_T *lili = tv_list_first(li_l);
     const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number;
@@ -2862,10 +3004,12 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
 
   xfree((void *)ptrs);
 
+update:
+  ;
   cw_interval_T *const cw_table_save = cw_table;
   const size_t cw_table_size_save = cw_table_size;
   cw_table = table;
-  cw_table_size = (size_t)tv_list_len(l);
+  cw_table_size = table_size;
 
   // Check that the new value does not conflict with 'listchars' or
   // 'fillchars'.