feat(mbyte): support extended grapheme clusters including more emoji

Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent unicode versions. Handle variant selector VS16 turning some codepoints into double-width emoji. This means we need to use ptr2cells rather than char2cells when possible.
author: bfredl <bjorn.linse@gmail.com> 2024-08-08 10:42:08 +0200
committer: bfredl <bjorn.linse@gmail.com> 2024-08-30 11:49:09 +0200
commit: cfdf68a7acde16597fbd896674af68c42361102c (patch)
tree: 6113193fda7a7c0f94577a464e39964e74311583 /src
parent: 4353996d0fa8e5872a334d68196d8088391960cf (diff)
download: rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip
23 files changed, 345 insertions, 176 deletions
diff --git a/src/nvim/api/extmark.c b/src/nvim/api/extmark.c
index 1673519479..d694b64f66 100644
--- a/src/nvim/api/extmark.c
+++ b/src/nvim/api/extmark.c
@@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer
     String c = opts->conceal;
     if (c.size > 0) {
       int ch;
-      hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch);
+      hl.conceal_char = utfc_ptr2schar(c.data, &ch);
       if (!hl.conceal_char || !vim_isprintc(ch)) {
         api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
         goto error;
diff --git a/src/nvim/api/ui.c b/src/nvim/api/ui.c
index 82a5ff5f8e..a99d97acb8 100644
--- a/src/nvim/api/ui.c
+++ b/src/nvim/api/ui.c
@@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco
       char sc_buf[MAX_SCHAR_SIZE];
       schar_get(sc_buf, chunk[i]);
       remote_ui_put(ui, sc_buf);
-      if (utf_ambiguous_width(utf_ptr2char(sc_buf))) {
+      if (utf_ambiguous_width(sc_buf)) {
         ui->client_col = -1;  // force cursor update
       }
     }
diff --git a/src/nvim/change.c b/src/nvim/change.c
index 6e9fab5a9b..47a9f0ce92 100644
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
   // delete the last combining character.
   if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
     char *p0 = oldp + col;
-    if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
+    GraphemeState state = GRAPHEME_STATE_INIT;
+    if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) {
       // Find the last composing char, there can be several.
       int n = col;
       do {
         col = n;
         count = utf_ptr2len(oldp + n);
         n += count;
-      } while (utf_composinglike(oldp + col, oldp + n));
+      } while (utf_composinglike(oldp + col, oldp + n, &state));
       fixpos = false;
     }
   }
@@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
     }
     if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
       while ((*p_extra == ' ' || *p_extra == '\t')
-             && !utf_iscomposing(utf_ptr2char(p_extra + 1))) {
+             && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
         if (REPLACE_NORMAL(State)) {
           replace_push(*p_extra);
         }
diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c
index 8149c5964f..7413d33fe4 100644
--- a/src/nvim/digraph.c
+++ b/src/nvim/digraph.c
@@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous)
   p = buf;
 
   // add a space to draw a composing char on
-  if (utf_iscomposing(dp->result)) {
+  if (utf_iscomposing_first(dp->result)) {
     *p++ = ' ';
   }
   p += utf_char2bytes(dp->result, p);
diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c
index 8a948716e5..b5273a54ca 100644
--- a/src/nvim/drawline.c
+++ b/src/nvim/drawline.c
@@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
 
         // If a double-width char doesn't fit display a '>' in the last column.
         // Don't advance the pointer but put the character at the start of the next line.
-        if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+        if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
           mb_c = '>';
           mb_l = 1;
           (void)mb_l;
@@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
       // If a double-width char doesn't fit display a '>' in the
       // last column; the character is displayed at the start of the
       // next line.
-      if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+      if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
         mb_schar = schar_from_ascii('>');
         mb_c = '>';
         mb_l = 1;
@@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
                 || (decor_conceal && decor_state.conceal_char)
                 || wp->w_p_cole == 1)
             && wp->w_p_cole != 3) {
+          if (schar_cells(mb_schar) > 1) {
+            // When the first char to be concealed is double-width,
+            // need to advance one more virtual column.
+            wlv.n_extra++;
+          }
+
           // First time at this concealed item: display one
           // character.
           if (has_match_conc && match_conc) {
@@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
             mb_schar = schar_from_ascii(' ');
           }
 
-          if (utf_char2cells(mb_c) > 1) {
-            // When the first char to be concealed is double-width,
-            // need to advance one more virtual column.
-            wlv.n_extra++;
-          }
-
           mb_c = schar_get_first_codepoint(mb_schar);
 
           prev_syntax_id = syntax_seqnr;
@@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
         && mb_schar != NUL) {
       mb_schar = wp->w_p_lcs_chars.prec;
       lcs_prec_todo = NUL;
-      if (utf_char2cells(mb_c) > 1) {
+      if (schar_cells(mb_schar) > 1) {
         // Double-width character being overwritten by the "precedes"
         // character, need to fill up half the character.
         wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
@@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
 
       linebuf_vcol[wlv.off] = wlv.vcol;
 
-      if (utf_char2cells(mb_c) > 1) {
+      if (schar_cells(mb_schar) > 1) {
         // Need to fill two screen columns.
         wlv.off++;
         wlv.col++;
@@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
       wlv.off++;
       wlv.col++;
     } else if (wp->w_p_cole > 0 && is_concealing) {
-      bool concealed_wide = utf_char2cells(mb_c) > 1;
+      bool concealed_wide = schar_cells(mb_schar) > 1;
 
       wlv.skip_cells--;
       wlv.vcol_off_co++;
diff --git a/src/nvim/edit.c b/src/nvim/edit.c
index 00ce38c4b1..f8723f9680 100644
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@@ -2832,6 +2832,8 @@ int replace_push_mb(char *p)
 {
   int l = utfc_ptr2len(p);
 
+  // TODO(bfredl): stop doing this insantity and instead use utf_head_off() when popping.
+  // or just keep a secondary array with char byte lenghts
   for (int j = l - 1; j >= 0; j--) {
     replace_push(p[j]);
   }
@@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc)
     for (int i = 1; i < n; i++) {
       buf[i] = (uint8_t)replace_pop();
     }
-    if (utf_iscomposing(utf_ptr2char((char *)buf))) {
+    // TODO(bfredl): by fixing replace_push_mb, upgrade to use
+    // the new composing algorithm
+    if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
       ins_bytes_len((char *)buf, (size_t)n);
     } else {
       // Not a composing char, put it back.
@@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
           space_sci = sci;
           space_vcol = vcol;
         }
-        vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value);
+        vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value);
         sci = utfc_next(sci);
         prev_space = cur_space;
       }
@@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
       // Find the position to stop backspacing.
       // Use charsize_nowrap() so that virtual text and wrapping are ignored.
       while (true) {
-        int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value);
+        int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value);
         if (space_vcol + size > want_vcol) {
           break;
         }
@@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
           bool has_composing = false;
           if (p_deco) {
             char *p0 = get_cursor_pos_ptr();
-            has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
+            has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL);
           }
           del_char(false);
           // If there are combining characters and 'delcombine' is set
diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c
index 6ac73527ee..1b6861f750 100644
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap)
       IObuff[iobuff_len++] = ' ';
     }
     IObuff[iobuff_len++] = '<';
-    if (utf_iscomposing(c)) {
+    if (utf_iscomposing_first(c)) {
       IObuff[iobuff_len++] = ' ';  // Draw composing char on top of a space.
     }
     iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c
index 8a34e03d91..722a857f03 100644
--- a/src/nvim/ex_getln.c
+++ b/src/nvim/ex_getln.c
@@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s)
     s->do_abbr = false;                   // don't do abbreviation now
     ccline.special_char = NUL;
     // may need to remove ^ when composing char was typed
-    if (utf_iscomposing(s->c) && !cmd_silent) {
+    if (utf_iscomposing_first(s->c) && !cmd_silent) {
       if (ui_has(kUICmdline)) {
         // TODO(bfredl): why not make unputcmdline also work with true?
         unputcmdline();
@@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw)
     // backup to the character before it.  There could be two of them.
     int i = 0;
     int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
-    while (ccline.cmdpos > 0 && utf_iscomposing(c)) {
+    // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the
+    // correct grapheme cluster breaks
+    while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) {
       i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
       ccline.cmdpos -= i;
       len += i;
diff --git a/src/nvim/grid.c b/src/nvim/grid.c
index 56246bf001..acb336c725 100644
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@@ -186,6 +186,24 @@ size_t schar_len(schar_T sc)
   }
 }
 
+int schar_cells(schar_T sc)
+{
+  // hot path
+#ifdef ORDER_BIG_ENDIAN
+  if (!(sc & 0x80FFFFFF)) {
+    return 1;
+  }
+#else
+  if (sc < 0x80) {
+    return 1;
+  }
+#endif
+
+  char sc_buf[MAX_SCHAR_SIZE];
+  schar_get(sc_buf, sc);
+  return utf_ptr2cells(sc_buf);
+}
+
 /// gets first raw UTF-8 byte of an schar
 static char schar_get_first_byte(schar_T sc)
 {
@@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
   const int max_col = grid_line_maxcol;
   while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
     // check if this is the first byte of a multibyte
-    int mbyte_blen = len > 0
-                     ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
-                     : utfc_ptr2len(ptr);
+    int mbyte_blen;
+    if (len >= 0) {
+      int maxlen = (int)((text + len) - ptr);
+      mbyte_blen = utfc_ptr2len_len(ptr, maxlen);
+      if (mbyte_blen > maxlen) {
+        mbyte_blen = 1;
+      }
+    } else {
+      mbyte_blen = utfc_ptr2len(ptr);
+    }
     int firstc;
-    schar_T schar = len >= 0
-                    ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
-                    : utfc_ptr2schar(ptr, &firstc);
-    int mbyte_cells = utf_char2cells(firstc);
+    schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc);
+    int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen);
     if (mbyte_cells > 2 || schar == 0) {
       mbyte_cells = 1;
       schar = schar_from_char(0xFFFD);
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0c1b537f3a..666a904fc5 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -511,20 +511,30 @@ int utf_char2cells(int c)
 
 /// Return the number of display cells character at "*p" occupies.
 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char *p)
+int utf_ptr2cells(const char *p_in)
 {
+  const uint8_t *p = (const uint8_t *)p_in;
   // Need to convert to a character number.
-  if ((uint8_t)(*p) >= 0x80) {
-    int c = utf_ptr2char(p);
+  if ((*p) >= 0x80) {
+    int len = utf8len_tab[*p];
+    int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
     // An illegal byte is displayed as <xx>.
-    if (utf_ptr2len(p) == 1 || c == NUL) {
+    if (c <= 0) {
       return 4;
     }
     // If the char is ASCII it must be an overlong sequence.
     if (c < 0x80) {
       return char2cells(c);
     }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji
+        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+      int c2 = utf_ptr2char(p_in + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
   }
   return 1;
 }
@@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size)
 {
   // Need to convert to a wide character.
   if (size > 0 && (uint8_t)(*p) >= 0x80) {
-    if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
+    int len = utf_ptr2len_len(p, size);
+    if (len < utf8len_tab[(uint8_t)(*p)]) {
       return 1;        // truncated
     }
     int c = utf_ptr2char(p);
@@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size)
     if (c < 0x80) {
       return char2cells(c);
     }
-    return utf_char2cells(c);
+    int cells = utf_char2cells(c);
+    if (cells == 1 && p_emoji && size > len
+        && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+        && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
+      int c2 = utf_ptr2char(p + len);
+      if (c2 == 0xFE0F) {
+        return 2;  // emoji presentation
+      }
+    }
+    return cells;
   }
   return 1;
 }
@@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
   size_t clen = 0;
 
   for (const char *p = str; *p != NUL && p < str + size;
-       p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
-    clen += (size_t)utf_ptr2cells(p);
+       p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
+    clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
   }
 
   return clen;
@@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp)
   return c;
 }
 
+/// When "c" is the first char of a string, determine if it needs to be prefixed
+/// by a space byte to be drawn correctly, and not merge with the space left of
+/// the string.
+bool utf_iscomposing_first(int c)
+{
+  return c >= 128 && !utf8proc_grapheme_break(' ', c);
+}
+
 /// Check if the character pointed to by "p2" is a composing character when it
-/// comes after "p1".  For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-bool utf_composinglike(const char *p1, const char *p2)
+/// comes after "p1".
+///
+/// We use the definition in UAX#29 as implemented by utf8proc with the following
+/// exceptions:
+///
+/// - ASCII chars always begin a new cluster. This is a long assumed invariant
+///   in the code base and very useful for performance (we can exit early for ASCII
+///   all over the place, branch predictor go brrr in ASCII-only text).
+///   As of Unicode 15.1 this will only break BOUNDCLASS_UREPEND followed by ASCII,
+///   which should be exceedingly rare (these PREPEND chars are expected to be
+///   followed by multibyte chars within the same script family)
+///
+/// - When 'arabicshape' is active, some pairs of arabic letters "ab" is replaced with
+///   "c" taking one single cell, which behaves like a cluster.
+///
+/// @param "state" should be set to GRAPHEME_STATE_INIT before first call
+///        it is allowed to be null, but will then not handle some longer
+///        sequences, like ZWJ based emoji
+bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
+  FUNC_ATTR_NONNULL_ARG(1, 2)
 {
-  int c2 = utf_ptr2char(p2);
-  if (utf_iscomposing(c2)) {
-    return true;
-  }
-  if (!arabic_maycombine(c2)) {
+  if ((uint8_t)(*p2) < 128) {
     return false;
   }
-  return arabic_combine(utf_ptr2char(p1), c2);
-}
 
-/// Check if the next character is a composing character when it
-/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-/// returns false for negative values
-bool utf_char_composinglike(int32_t const first, int32_t const next)
-  FUNC_ATTR_PURE
-{
-  return utf_iscomposing(next) || arabic_combine(first, next);
+  int first = utf_ptr2char(p1);
+  int second = utf_ptr2char(p2);
+
+  if (!utf8proc_grapheme_break_stateful(first, second, state)) {
+    return true;
+  }
+
+  return arabic_combine(first, second);
 }
 
 /// Get the screen char at the beginning of a string
@@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
 {
   int c = utf_ptr2char(p);
   *firstc = c;  // NOT optional, you are gonna need it
-  bool first_compose = utf_iscomposing(c);
+  bool first_compose = utf_iscomposing_first(c);
   size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
   size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
 
@@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
   return schar_from_buf_first(p, len, first_compose);
 }
 
-/// Get the screen char at the beginning of a string with length
+/// Get the screen char from a char with a known length
 ///
 /// Like utfc_ptr2schar but use no more than p[maxlen].
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
   FUNC_ATTR_NONNULL_ALL
 {
-  assert(maxlen > 0);
-
-  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
-  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+  if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
     // invalid or truncated sequence
     *firstc = (uint8_t)(*p);
     return 0;
@@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
 
   int c = utf_ptr2char(p);
   *firstc = c;
-  bool first_compose = utf_iscomposing(c);
-  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
-  len = (size_t)utfc_ptr2len_len(p, maxlen);
+  bool first_compose = utf_iscomposing_first(c);
+  int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+  if (len > maxlen) {
+    len = utfc_ptr2len_len(p, maxlen);
+  }
 
-  return schar_from_buf_first(p, len, first_compose);
+  return schar_from_buf_first(p, (size_t)len, first_compose);
 }
 
 /// Caller must ensure there is space for `first_compose`
@@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p)
 
   // Check for composing characters.
   int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
   while (true) {
-    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
+    if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
       return len;
     }
 
@@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size)
     return 1;
   }
 
-  // Check for composing characters.  We can handle only the first six, but
+  // Check for composing characters.  We can only display a limited amount, but
   // skip all of them (otherwise the cursor would get stuck).
   int prevlen = 0;
+  GraphemeState state = GRAPHEME_STATE_INIT;
   while (len < size) {
     if ((uint8_t)p[len] < 0x80) {
       break;
@@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size)
       break;
     }
 
-    if (!utf_composinglike(p + prevlen, p + len)) {
+    if (!utf_composinglike(p + prevlen, p + len, &state)) {
       break;
     }
 
@@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf)
   }
 }
 
-/// Return true if "c" is a composing UTF-8 character.
-/// This means it will be drawn on top of the preceding character.
+/// Return true if "c" is a legacy composing UTF-8 character.
+///
+/// This is deprecated in favour of utf_composinglike() which uses the modern
+/// stateful algorithm to determine grapheme clusters. Still available
+/// to support some legacy code which hasn't been refactored yet.
+///
+/// To check if a char would combine with a preceeding space, use
+/// utf_iscomposing_first() instead.
+///
 /// Based on code from Markus Kuhn.
 /// Returns false for negative values.
-bool utf_iscomposing(int c)
+bool utf_iscomposing_legacy(int c)
 {
   return intable(combining, ARRAY_SIZE(combining), c);
 }
@@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
   return 2;
 }
 
-bool utf_ambiguous_width(int c)
+bool utf_ambiguous_width(const char *p)
 {
+  int c = utf_ptr2char(p);
   return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
                        || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
 }
@@ -1666,6 +1714,26 @@ void show_utf8(void)
   msg(IObuff, 0);
 }
 
+/// @return true if boundclass bc always starts a new cluster regardless of what's before
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break(int bc)
+{
+  return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
+}
+
+/// @return true if bc2 always starts a cluster after bc1
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break_two(int bc1, int bc2)
+{
+  // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
+  // "always_break" on first iteration or when it was bc1 in the previous iteration
+  return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
+          || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
+          || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+              && (bc1 == UTF8PROC_BOUNDCLASS_OTHER
+                  || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
+}
+
 /// Return offset from "p" to the start of a character, including composing characters.
 /// "base" must be the start of the string, which must be NUL terminated.
 /// If "p" points to the NUL at the end of the string return 0.
@@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in)
   const uint8_t *base = (uint8_t *)base_in;
   const uint8_t *p = (uint8_t *)p_in;
 
-  // Skip backwards over trailing bytes: 10xx.xxxx
-  // Skip backwards again if on a composing char.
-  const uint8_t *q;
-  for (q = p;; q--) {
-    // Move s to the last byte of this char.
-    const uint8_t *s;
-    for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
-
-    // Move q to the first byte of this char.
-    while (q > base && (*q & 0xc0) == 0x80) {
-      q--;
-    }
-    // Check for illegal sequence. Do allow an illegal byte after where we
-    // started.
-    int len = utf8len_tab[*q];
-    if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
-      return 0;
+  const uint8_t *start = p;
+
+  // move start to the first byte of this codepoint
+  // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
+  while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
+    start--;
+  }
+
+  uint8_t cur_len = utf8len_tab[*start];
+  int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
+  if (cur_code < 0) {
+    return 0;  // p must be part of an illegal sequence
+  }
+  const uint8_t * const safe_end = start + cur_len;
+
+  int cur_bc = utf8proc_get_property(cur_code)->boundclass;
+  if (always_break(cur_bc)) {
+    return (int)(p - start);
+  }
+
+  // backtrack to find the start of a cluster. we might go too far, checked in the next loop
+  const uint8_t *cur_pos = start;
+  const uint8_t *const p_start = start;
+
+  if (start == base) {
+    return (int)(p - start);
+  }
+
+  start--;
+  while (*start >= 0x80) {  // stop on ascii, we are done
+    while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
+      start--;
     }
 
-    if (q <= base) {
+    int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
+    if (prev_code < 0) {
+      start = cur_pos;  // start at valid sequence after invalid bytes
       break;
     }
 
-    int c = utf_ptr2char((char *)q);
-    if (utf_iscomposing(c)) {
-      continue;
+    int prev_bc = utf8proc_get_property(prev_code)->boundclass;
+    if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
+      start = cur_pos;  // prev_code cannot be a part of this cluster
+      break;
+    } else if (start == base) {
+      break;
     }
+    cur_pos = start;
+    cur_bc = prev_bc;
+    cur_code = prev_code;
 
-    if (arabic_maycombine(c)) {
-      // Advance to get a sneak-peak at the next char
-      const uint8_t *j = q;
-      j--;
-      // Move j to the first byte of this char.
-      while (j > base && (*j & 0xc0) == 0x80) {
-        j--;
-      }
-      if (arabic_combine(utf_ptr2char((char *)j), c)) {
-        continue;
-      }
+    start--;
+  }
+
+  // hot path: we are already on the first codepoint of a sequence
+  if (start == p_start) {
+    return (int)(p - start);
+  }
+
+  const uint8_t *q = start;
+  while (q < p) {
+    // don't need to find end of cluster. once we reached the codepoint of p, we are done
+    int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
+
+    if (q + len > p) {
+      return (int)(p - q);
     }
-    break;
+
+    q += len;
   }
 
-  return (int)(p - q);
+  return 0;
+}
+
+/// Assumes caller already handles ascii. see `utfc_next`
+StrCharInfo utfc_next_impl(StrCharInfo cur)
+{
+  int32_t prev_code = cur.chr.value;
+  uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
+  GraphemeState state = GRAPHEME_STATE_INIT;
+  assert(*next >= 0x80);
+
+  while (true) {
+    uint8_t const next_len = utf8len_tab[*next];
+    int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
+    if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
+        && !arabic_combine(prev_code, next_code)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
+      };
+    }
+
+    prev_code = next_code;
+    next += next_len;
+    if (EXPECT(*next < 0x80U, true)) {
+      return (StrCharInfo){
+        .ptr = (char *)next,
+        .chr = (CharInfo){ .value = *next, .len = 1 },
+      };
+    }
+  }
 }
 
 // Whether space is NOT allowed before/after 'c'.
@@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
             c = 0x100; break;                   // not in latin9
           }
         }
-        if (!utf_iscomposing(c)) {              // skip composing chars
+        if (!utf_iscomposing_legacy(c)) {  // skip composing chars
           if (c < 0x100) {
             *d++ = (uint8_t)c;
           } else if (vcp->vc_fail) {
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 6cbfbcbc3c..2da051fca2 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -3,6 +3,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <sys/types.h>  // IWYU pragma: keep
+#include <utf8proc.h>
 #include <uv.h>  // IWYU pragma: keep
 
 #include "nvim/cmdexpand_defs.h"  // IWYU pragma: keep
@@ -11,6 +12,9 @@
 #include "nvim/mbyte_defs.h"  // IWYU pragma: keep
 #include "nvim/types_defs.h"  // IWYU pragma: keep
 
+typedef utf8proc_int32_t GraphemeState;
+#define GRAPHEME_STATE_INIT 0
+
 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "mbyte.h.generated.h"
 # include "mbyte.h.inline.generated.h"
@@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
 static inline StrCharInfo utfc_next(StrCharInfo cur)
   FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
 {
-  int32_t prev_code = cur.chr.value;
+  // handle ASCII case inline
   uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
-
-  while (true) {
-    if (EXPECT(*next < 0x80U, true)) {
-      return (StrCharInfo){
-        .ptr = (char *)next,
-        .chr = (CharInfo){ .value = *next, .len = 1 },
-      };
-    }
-    uint8_t const next_len = utf8len_tab[*next];
-    int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
-    if (!utf_char_composinglike(prev_code, next_code)) {
-      return (StrCharInfo){
-        .ptr = (char *)next,
-        .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
-      };
-    }
-
-    prev_code = next_code;
-    next += next_len;
+  if (EXPECT(*next < 0x80U, true)) {
+    return (StrCharInfo){
+      .ptr = (char *)next,
+      .chr = (CharInfo){ .value = *next, .len = 1 },
+    };
   }
+
+  return utfc_next_impl(cur);
 }
 
 static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
diff --git a/src/nvim/message.c b/src/nvim/message.c
index 53e5511a5a..79e6bc8be7 100644
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen)
   // Last part: End of the string.
   half = i = (int)strlen(s);
   while (true) {
-    do {
-      half = half - utf_head_off(s, s + half - 1) - 1;
-    } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half)));
+    half = half - utf_head_off(s, s + half - 1) - 1;
     n = ptr2cells(s + half);
     if (len + n > room || half == 0) {
       break;
diff --git a/src/nvim/normal.c b/src/nvim/normal.c
index f44a64af21..f3bdea9a85 100644
--- a/src/nvim/normal.c
+++ b/src/nvim/normal.c
@@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s)
       while ((s->c = vpeekc()) > 0
              && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
         s->c = plain_vgetc();
-        if (!utf_iscomposing(s->c)) {
+        // TODO(bfredl): only allowing up to two composing chars is cringe af.
+        // Could reuse/abuse schar_T to at least allow us to input anything we are able
+        // to display and use the stateful utf8proc algorithm like utf_composinglike
+        if (!utf_iscomposing_legacy(s->c)) {
           vungetc(s->c);                   // it wasn't, put it back
           break;
         } else if (s->ca.ncharC1 == 0) {
diff --git a/src/nvim/options.lua b/src/nvim/options.lua
index 3612a80fb8..1c17b0fc9f 100644
--- a/src/nvim/options.lua
+++ b/src/nvim/options.lua
@@ -2326,9 +2326,12 @@ return {
       desc = [=[
         When on all Unicode emoji characters are considered to be full width.
         This excludes "text emoji" characters, which are normally displayed as
-        single width.  Unfortunately there is no good specification for this
-        and it has been determined on trial-and-error basis.  Use the
-        |setcellwidths()| function to change the behavior.
+        single width. However, such "text emoji" are treated as full-width
+        emoji if they are followed by the U+FE0F variant selector.
+
+        Unfortunately there is no good specification for this and it has been
+        determined on trial-and-error basis.  Use the |setcellwidths()|
+        function to change the behavior.
       ]=],
       full_name = 'emoji',
       redraw = { 'all_windows', 'ui_option' },
diff --git a/src/nvim/plines.c b/src/nvim/plines.c
index e51e9bf8c3..408fe26bf3 100644
--- a/src/nvim/plines.c
+++ b/src/nvim/plines.c
@@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
   } else if (cur_char < 0) {
     size = kInvalidByteCells;
   } else {
-    size = char2cells(cur_char);
+    size = ptr2cells(cur);
     is_doublewidth = size == 2 && cur_char > 0x80;
   }
 
@@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
 ///
 /// @see charsize_regular
 /// @see charsize_fast
-static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol,
-                                          int32_t const cur_char)
+static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop,
+                                          colnr_T const vcol, int32_t const cur_char)
   FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
 {
   // A tab gets expanded, depending on the current column
@@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
     if (cur_char < 0) {
       width = kInvalidByteCells;
     } else {
-      width = char2cells(cur_char);
+      // TODO(bfredl): perf: often cur_char is enough at this point to determine width.
+      // we likely want a specialized version of utf_ptr2StrCharInfo also determining
+      // the ptr2cells width at the same time without any extra decoding. (also applies
+      // to charsize_regular and charsize_nowrap)
+      width = ptr2cells(cur);
     }
 
     // If a double-width char doesn't fit at the end of a line, it wraps to the next line,
@@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
 /// Can be used if CSType is kCharsizeFast.
 ///
 /// @see charsize_regular
-CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char)
+CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char)
   FUNC_ATTR_PURE
 {
-  return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char);
+  return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char);
 }
 
 /// Get the number of cells taken up on the screen at given virtual column.
 ///
 /// @see win_chartabsize()
-int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char)
+int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char)
 {
   if (cur_char == TAB && use_tabstop) {
     return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
   } else if (cur_char < 0) {
     return kInvalidByteCells;
   } else {
-    return char2cells(cur_char);
+    return ptr2cells(cur);
   }
 }
 
@@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le
 
   StrCharInfo ci = utf_ptr2StrCharInfo(line);
   while (ci.ptr - line < len && *ci.ptr != NUL) {
-    vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width;
+    vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width;
     ci = utfc_next(ci);
     if (vcol > MAXCOL) {
       vcol_arg = MAXCOL;
@@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
         char_size = (CharSize){ .width = 1 };
         break;
       }
-      char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value);
+      char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value);
       StrCharInfo const next = utfc_next(ci);
       if (next.ptr - line > end_col) {
         break;
@@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e
     if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
       int c = utf_ptr2char(ptr + pos->col);
       if ((c != TAB) && vim_isprintc(c)) {
-        endadd = (colnr_T)(char2cells(c) - 1);
+        endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1);
         if (coladd > endadd) {
           // past end of line
           endadd = 0;
@@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column)
   if (cstype == kCharsizeFast) {
     bool const use_tabstop = csarg.use_tabstop;
     while (*ci.ptr != NUL && --column >= 0) {
-      vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width;
+      vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width;
       ci = utfc_next(ci);
     }
   } else {
diff --git a/src/nvim/plines.h b/src/nvim/plines.h
index 7128e37237..50310b8ce1 100644
--- a/src/nvim/plines.h
+++ b/src/nvim/plines.h
@@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t
   FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
 {
   if (cstype == kCharsizeFast) {
-    return charsize_fast(csarg, vcol, chr);
+    return charsize_fast(csarg, ptr, vcol, chr);
   } else {
     return charsize_regular(csarg, ptr, vcol, chr);
   }
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 7dbbb19545..c91c112c3c 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
 {
   return utf_char2len(c) > 1
          && (re_multi_type(peekchr()) != NOT_MULTI
-             || utf_iscomposing(c));
+             || utf_iscomposing_legacy(c));
 }
 
 // Emit (if appropriate) a byte of code
@@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
     }
     // When '.' is followed by a composing char ignore the dot, so that
     // the composing char is matched here.
-    if (c == Magic('.') && utf_iscomposing(peekchr())) {
+    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
       c = getchr();
       goto do_multibyte;
     }
@@ -5001,9 +5001,10 @@ do_multibyte:
           int l;
 
           // Need to get composing character too.
+          GraphemeState state = GRAPHEME_STATE_INIT;
           while (true) {
             l = utf_ptr2len(regparse);
-            if (!utf_composinglike(regparse, regparse + l)) {
+            if (!utf_composinglike(regparse, regparse + l, &state)) {
               break;
             }
             regmbc(utf_ptr2char(regparse));
@@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
             // Check for following composing character, unless %C
             // follows (skips over all composing chars).
             if (status != RA_NOMATCH
-                && utf_composinglike((char *)rex.input, (char *)rex.input + len)
+                && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
                 && !rex.reg_icombine
                 && OP(next) != RE_COMPOSING) {
               // raaron: This code makes a composing character get
@@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
             break;
           }
           const int opndc = utf_ptr2char((char *)opnd);
-          if (utf_iscomposing(opndc)) {
+          if (utf_iscomposing_legacy(opndc)) {
             // When only a composing char is given match at any
             // position where that composing char appears.
             status = RA_NOMATCH;
             for (i = 0; rex.input[i] != NUL;
                  i += utf_ptr2len((char *)rex.input + i)) {
               const int inpc = utf_ptr2char((char *)rex.input + i);
-              if (!utf_iscomposing(inpc)) {
+              if (!utf_iscomposing_legacy(inpc)) {
                 if (i > 0) {
                   break;
                 }
@@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
 
         case RE_COMPOSING:
           // Skip composing characters.
-          while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
+          while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
             rex.input += utf_ptr2len((char *)rex.input);
           }
           break;
@@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
     }
     // When '.' is followed by a composing char ignore the dot, so that
     // the composing char is matched here.
-    if (c == Magic('.') && utf_iscomposing(peekchr())) {
+    if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
       old_regparse = (uint8_t *)regparse;
       c = getchr();
       goto nfa_do_multibyte;
@@ -10705,7 +10706,7 @@ collection:
 nfa_do_multibyte:
     // plen is length of current char with composing chars
     if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
-        || utf_iscomposing(c)) {
+        || utf_iscomposing_legacy(c)) {
       int i = 0;
 
       // A base character plus composing characters, or just one
@@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
     }
     if (match
         // check that no composing char follows
-        && !utf_iscomposing(utf_ptr2char((char *)s2))) {
+        && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
       cleanup_subexpr();
       if (REG_MULTI) {
         rex.reg_startpos[0].lnum = rex.lnum;
@@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
         // is not really a match.
         if (!rex.reg_icombine
             && rex.input != rex.line
-            && utf_iscomposing(curc)) {
+            && utf_iscomposing_legacy(curc)) {
           break;
         }
         nfa_match = true;
@@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
 
         sta = t->state->out;
         len = 0;
-        if (utf_iscomposing(sta->c)) {
+        if (utf_iscomposing_legacy(sta->c)) {
           // Only match composing character(s), ignore base
           // character.  Used for ".{composing}" and "{composing}"
           // (no preceding character).
@@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
             int j;
 
             sta = t->state->out->out;
-            if (utf_iscomposing(sta->c)) {
+            if (utf_iscomposing_legacy(sta->c)) {
               // Only match composing character(s), ignore base
               // character.  Used for ".{composing}" and "{composing}"
               // (no preceding character).
@@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
       case NFA_ANY_COMPOSING:
         // On a composing character skip over it.  Otherwise do
         // nothing.  Always matches.
-        if (utf_iscomposing(curc)) {
+        if (utf_iscomposing_legacy(curc)) {
           add_off = clen;
         } else {
           add_here = true;
diff --git a/src/nvim/search.c b/src/nvim/search.c
index 9e00664d86..ff6e135df1 100644
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen
       // empty for the search_stat feature.
       if (!cmd_silent) {
         msgbuf[0] = (char)dirc;
-        if (utf_iscomposing(utf_ptr2char(p))) {
+        if (utf_iscomposing_first(utf_ptr2char(p))) {
           // Use a space to draw the composing char on.
           msgbuf[1] = ' ';
           memmove(msgbuf + 2, p, plen);
diff --git a/src/nvim/sign.c b/src/nvim/sign.c
index 9b2516ed83..b4ba7833e9 100644
--- a/src/nvim/sign.c
+++ b/src/nvim/sign.c
@@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text)
     if (!vim_isprintc(c)) {
       break;
     }
-    int width = utf_char2cells(c);
+    int width = utf_ptr2cells(s);
     if (width == 2) {
       sign_text[cells + 1] = 0;
     }
diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c
index d6053a533e..b37f01e769 100644
--- a/src/nvim/spellsuggest.c
+++ b/src/nvim/spellsuggest.c
@@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
               // For changing a composing character adjust
               // the score from SCORE_SUBST to
               // SCORE_SUBCOMP.
-              if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
-                                               - sp->ts_tcharlen))
-                  && utf_iscomposing(utf_ptr2char(fword
-                                                  + sp->ts_fcharstart))) {
+              if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen))
+                  && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) {
                 sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
               } else if (!soundfold
                          && slang->sl_has_map
@@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
                        && sp->ts_twordlen > sp->ts_tcharlen) {
               p = tword + sp->ts_twordlen - sp->ts_tcharlen;
               c = utf_ptr2char(p);
-              if (utf_iscomposing(c)) {
+              if (utf_iscomposing_legacy(c)) {
                 // Inserting a composing char doesn't
                 // count that much.
                 sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
@@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
         c = utf_ptr2char(fword + sp->ts_fidx);
         stack[depth].ts_fidx =
           (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
-        if (utf_iscomposing(c)) {
+        if (utf_iscomposing_legacy(c)) {
           stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
         } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
           stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c
index 96907362dd..30c7d0ee92 100644
--- a/src/nvim/textformat.c
+++ b/src/nvim/textformat.c
@@ -47,7 +47,7 @@ static bool did_add_space = false;  ///< auto_format() added an extra space
                                     ///< under the cursor
 
 #define WHITECHAR(cc) (ascii_iswhite(cc) \
-                       && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
+                       && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
 
 /// Return true if format option 'x' is in effect.
 /// Take care of no formatting when 'paste' is set.
diff --git a/src/nvim/tui/tui.c b/src/nvim/tui/tui.c
index 1866a4a592..7e1068ed56 100644
--- a/src/nvim/tui/tui.c
+++ b/src/nvim/tui/tui.c
@@ -109,6 +109,7 @@ struct TUIData {
   bool set_cursor_color_as_str;
   bool cursor_color_changed;
   bool is_starting;
+  bool did_set_grapheme_cluster_mode;
   FILE *screenshot;
   cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
   HlAttrs clear_attrs;
@@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set)
 void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
   FUNC_ATTR_NONNULL_ALL
 {
+  bool is_set = false;
   switch (state) {
   case kTermModeNotRecognized:
   case kTermModePermanentlySet:
@@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
     // then there is nothing to do
     break;
   case kTermModeSet:
+    is_set = true;
+    FALLTHROUGH;
   case kTermModeReset:
     // The terminal supports changing the given mode
     switch (mode) {
@@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
       signal_watcher_stop(&tui->winch_handle);
       tui_set_term_mode(tui, mode, true);
       break;
+    case kTermModeGraphemeClusters:
+      if (!is_set) {
+        tui_set_term_mode(tui, mode, true);
+        tui->did_set_grapheme_cluster_mode = true;
+      }
+      break;
     }
   }
 }
@@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui)
   if (!nsterm) {
     tui_request_term_mode(tui, kTermModeSynchronizedOutput);
     tui_request_term_mode(tui, kTermModeResizeEvents);
+    tui_request_term_mode(tui, kTermModeGraphemeClusters);
   }
 
   // Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
@@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui)
 
   // Disable resize events
   tui_set_term_mode(tui, kTermModeResizeEvents, false);
-
+  if (tui->did_set_grapheme_cluster_mode) {
+    tui_set_term_mode(tui, kTermModeGraphemeClusters, false);
+  }
   // May restore old title before exiting alternate screen.
   tui_set_title(tui, NULL_STRING);
   if (ui_client_exit_status == 0) {
@@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool
   char buf[MAX_SCHAR_SIZE];
   schar_get(buf, cell->data);
   int c = utf_ptr2char(buf);
-  bool is_ambiwidth = utf_ambiguous_width(c);
+  bool is_ambiwidth = utf_ambiguous_width(buf);
   if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
     // If the server used setcellwidths() to treat a single-width char as double-width,
     // it needs to be treated like an ambiguous-width char.
diff --git a/src/nvim/tui/tui_defs.h b/src/nvim/tui/tui_defs.h
index 46913e07a2..bd99d6b0ad 100644
--- a/src/nvim/tui/tui_defs.h
+++ b/src/nvim/tui/tui_defs.h
@@ -4,6 +4,7 @@ typedef struct TUIData TUIData;
 
 typedef enum {
   kTermModeSynchronizedOutput = 2026,
+  kTermModeGraphemeClusters = 2027,
   kTermModeResizeEvents = 2048,
 } TermMode;
author	bfredl <bjorn.linse@gmail.com>	2024-08-08 10:42:08 +0200
committer	bfredl <bjorn.linse@gmail.com>	2024-08-30 11:49:09 +0200
commit	cfdf68a7acde16597fbd896674af68c42361102c (patch)
tree	6113193fda7a7c0f94577a464e39964e74311583 /src
parent	4353996d0fa8e5872a334d68196d8088391960cf (diff)
download	rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2 rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip