aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorbfredl <bjorn.linse@gmail.com>2024-08-30 12:58:48 +0200
committerGitHub <noreply@github.com>2024-08-30 12:58:48 +0200
commit5f95f1249f464e4f0ceed468ec5a1ba6e810da14 (patch)
tree6113193fda7a7c0f94577a464e39964e74311583 /src
parent4353996d0fa8e5872a334d68196d8088391960cf (diff)
parentcfdf68a7acde16597fbd896674af68c42361102c (diff)
downloadrneovim-5f95f1249f464e4f0ceed468ec5a1ba6e810da14.tar.gz
rneovim-5f95f1249f464e4f0ceed468ec5a1ba6e810da14.tar.bz2
rneovim-5f95f1249f464e4f0ceed468ec5a1ba6e810da14.zip
Merge pull request #30014 from bfredl/neoemoji
support emojis with ZWJ and variant selectors
Diffstat (limited to 'src')
-rw-r--r--src/nvim/api/extmark.c2
-rw-r--r--src/nvim/api/ui.c2
-rw-r--r--src/nvim/change.c7
-rw-r--r--src/nvim/digraph.c2
-rw-r--r--src/nvim/drawline.c22
-rw-r--r--src/nvim/edit.c12
-rw-r--r--src/nvim/ex_cmds.c2
-rw-r--r--src/nvim/ex_getln.c6
-rw-r--r--src/nvim/grid.c37
-rw-r--r--src/nvim/mbyte.c286
-rw-r--r--src/nvim/mbyte.h32
-rw-r--r--src/nvim/message.c4
-rw-r--r--src/nvim/normal.c5
-rw-r--r--src/nvim/options.lua9
-rw-r--r--src/nvim/plines.c28
-rw-r--r--src/nvim/plines.h2
-rw-r--r--src/nvim/regexp.c29
-rw-r--r--src/nvim/search.c2
-rw-r--r--src/nvim/sign.c2
-rw-r--r--src/nvim/spellsuggest.c10
-rw-r--r--src/nvim/textformat.c2
-rw-r--r--src/nvim/tui/tui.c17
-rw-r--r--src/nvim/tui/tui_defs.h1
23 files changed, 345 insertions, 176 deletions
diff --git a/src/nvim/api/extmark.c b/src/nvim/api/extmark.c
index 1673519479..d694b64f66 100644
--- a/src/nvim/api/extmark.c
+++ b/src/nvim/api/extmark.c
@@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer
String c = opts->conceal;
if (c.size > 0) {
int ch;
- hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch);
+ hl.conceal_char = utfc_ptr2schar(c.data, &ch);
if (!hl.conceal_char || !vim_isprintc(ch)) {
api_set_error(err, kErrorTypeValidation, "conceal char has to be printable");
goto error;
diff --git a/src/nvim/api/ui.c b/src/nvim/api/ui.c
index 82a5ff5f8e..a99d97acb8 100644
--- a/src/nvim/api/ui.c
+++ b/src/nvim/api/ui.c
@@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco
char sc_buf[MAX_SCHAR_SIZE];
schar_get(sc_buf, chunk[i]);
remote_ui_put(ui, sc_buf);
- if (utf_ambiguous_width(utf_ptr2char(sc_buf))) {
+ if (utf_ambiguous_width(sc_buf)) {
ui->client_col = -1; // force cursor update
}
}
diff --git a/src/nvim/change.c b/src/nvim/change.c
index 6e9fab5a9b..47a9f0ce92 100644
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
// delete the last combining character.
if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
char *p0 = oldp + col;
- if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
+ GraphemeState state = GRAPHEME_STATE_INIT;
+ if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) {
// Find the last composing char, there can be several.
int n = col;
do {
col = n;
count = utf_ptr2len(oldp + n);
n += count;
- } while (utf_composinglike(oldp + col, oldp + n));
+ } while (utf_composinglike(oldp + col, oldp + n, &state));
fixpos = false;
}
}
@@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment)
}
if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) {
while ((*p_extra == ' ' || *p_extra == '\t')
- && !utf_iscomposing(utf_ptr2char(p_extra + 1))) {
+ && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) {
if (REPLACE_NORMAL(State)) {
replace_push(*p_extra);
}
diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c
index 8149c5964f..7413d33fe4 100644
--- a/src/nvim/digraph.c
+++ b/src/nvim/digraph.c
@@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous)
p = buf;
// add a space to draw a composing char on
- if (utf_iscomposing(dp->result)) {
+ if (utf_iscomposing_first(dp->result)) {
*p++ = ' ';
}
p += utf_char2bytes(dp->result, p);
diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c
index 8a948716e5..b5273a54ca 100644
--- a/src/nvim/drawline.c
+++ b/src/nvim/drawline.c
@@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
// If a double-width char doesn't fit display a '>' in the last column.
// Don't advance the pointer but put the character at the start of the next line.
- if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+ if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
mb_c = '>';
mb_l = 1;
(void)mb_l;
@@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
// If a double-width char doesn't fit display a '>' in the
// last column; the character is displayed at the start of the
// next line.
- if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
+ if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) {
mb_schar = schar_from_ascii('>');
mb_c = '>';
mb_l = 1;
@@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
|| (decor_conceal && decor_state.conceal_char)
|| wp->w_p_cole == 1)
&& wp->w_p_cole != 3) {
+ if (schar_cells(mb_schar) > 1) {
+ // When the first char to be concealed is double-width,
+ // need to advance one more virtual column.
+ wlv.n_extra++;
+ }
+
// First time at this concealed item: display one
// character.
if (has_match_conc && match_conc) {
@@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
mb_schar = schar_from_ascii(' ');
}
- if (utf_char2cells(mb_c) > 1) {
- // When the first char to be concealed is double-width,
- // need to advance one more virtual column.
- wlv.n_extra++;
- }
-
mb_c = schar_get_first_codepoint(mb_schar);
prev_syntax_id = syntax_seqnr;
@@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
&& mb_schar != NUL) {
mb_schar = wp->w_p_lcs_chars.prec;
lcs_prec_todo = NUL;
- if (utf_char2cells(mb_c) > 1) {
+ if (schar_cells(mb_schar) > 1) {
// Double-width character being overwritten by the "precedes"
// character, need to fill up half the character.
wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR);
@@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
linebuf_vcol[wlv.off] = wlv.vcol;
- if (utf_char2cells(mb_c) > 1) {
+ if (schar_cells(mb_schar) > 1) {
// Need to fill two screen columns.
wlv.off++;
wlv.col++;
@@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s
wlv.off++;
wlv.col++;
} else if (wp->w_p_cole > 0 && is_concealing) {
- bool concealed_wide = utf_char2cells(mb_c) > 1;
+ bool concealed_wide = schar_cells(mb_schar) > 1;
wlv.skip_cells--;
wlv.vcol_off_co++;
diff --git a/src/nvim/edit.c b/src/nvim/edit.c
index 00ce38c4b1..f8723f9680 100644
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@@ -2832,6 +2832,8 @@ int replace_push_mb(char *p)
{
int l = utfc_ptr2len(p);
+ // TODO(bfredl): stop doing this insantity and instead use utf_head_off() when popping.
+ // or just keep a secondary array with char byte lenghts
for (int j = l - 1; j >= 0; j--) {
replace_push(p[j]);
}
@@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc)
for (int i = 1; i < n; i++) {
buf[i] = (uint8_t)replace_pop();
}
- if (utf_iscomposing(utf_ptr2char((char *)buf))) {
+ // TODO(bfredl): by fixing replace_push_mb, upgrade to use
+ // the new composing algorithm
+ if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) {
ins_bytes_len((char *)buf, (size_t)n);
} else {
// Not a composing char, put it back.
@@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
space_sci = sci;
space_vcol = vcol;
}
- vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value);
+ vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value);
sci = utfc_next(sci);
prev_space = cur_space;
}
@@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
// Find the position to stop backspacing.
// Use charsize_nowrap() so that virtual text and wrapping are ignored.
while (true) {
- int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value);
+ int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value);
if (space_vcol + size > want_vcol) {
break;
}
@@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
bool has_composing = false;
if (p_deco) {
char *p0 = get_cursor_pos_ptr();
- has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
+ has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL);
}
del_char(false);
// If there are combining characters and 'delcombine' is set
diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c
index 6ac73527ee..1b6861f750 100644
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap)
IObuff[iobuff_len++] = ' ';
}
IObuff[iobuff_len++] = '<';
- if (utf_iscomposing(c)) {
+ if (utf_iscomposing_first(c)) {
IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space.
}
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c
index 8a34e03d91..722a857f03 100644
--- a/src/nvim/ex_getln.c
+++ b/src/nvim/ex_getln.c
@@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s)
s->do_abbr = false; // don't do abbreviation now
ccline.special_char = NUL;
// may need to remove ^ when composing char was typed
- if (utf_iscomposing(s->c) && !cmd_silent) {
+ if (utf_iscomposing_first(s->c) && !cmd_silent) {
if (ui_has(kUICmdline)) {
// TODO(bfredl): why not make unputcmdline also work with true?
unputcmdline();
@@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw)
// backup to the character before it. There could be two of them.
int i = 0;
int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos);
- while (ccline.cmdpos > 0 && utf_iscomposing(c)) {
+ // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the
+ // correct grapheme cluster breaks
+ while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) {
i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1;
ccline.cmdpos -= i;
len += i;
diff --git a/src/nvim/grid.c b/src/nvim/grid.c
index 56246bf001..acb336c725 100644
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@@ -186,6 +186,24 @@ size_t schar_len(schar_T sc)
}
}
+int schar_cells(schar_T sc)
+{
+ // hot path
+#ifdef ORDER_BIG_ENDIAN
+ if (!(sc & 0x80FFFFFF)) {
+ return 1;
+ }
+#else
+ if (sc < 0x80) {
+ return 1;
+ }
+#endif
+
+ char sc_buf[MAX_SCHAR_SIZE];
+ schar_get(sc_buf, sc);
+ return utf_ptr2cells(sc_buf);
+}
+
/// gets first raw UTF-8 byte of an schar
static char schar_get_first_byte(schar_T sc)
{
@@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
const int max_col = grid_line_maxcol;
while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
// check if this is the first byte of a multibyte
- int mbyte_blen = len > 0
- ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
- : utfc_ptr2len(ptr);
+ int mbyte_blen;
+ if (len >= 0) {
+ int maxlen = (int)((text + len) - ptr);
+ mbyte_blen = utfc_ptr2len_len(ptr, maxlen);
+ if (mbyte_blen > maxlen) {
+ mbyte_blen = 1;
+ }
+ } else {
+ mbyte_blen = utfc_ptr2len(ptr);
+ }
int firstc;
- schar_T schar = len >= 0
- ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
- : utfc_ptr2schar(ptr, &firstc);
- int mbyte_cells = utf_char2cells(firstc);
+ schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc);
+ int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen);
if (mbyte_cells > 2 || schar == 0) {
mbyte_cells = 1;
schar = schar_from_char(0xFFFD);
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0c1b537f3a..666a904fc5 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -511,20 +511,30 @@ int utf_char2cells(int c)
/// Return the number of display cells character at "*p" occupies.
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char *p)
+int utf_ptr2cells(const char *p_in)
{
+ const uint8_t *p = (const uint8_t *)p_in;
// Need to convert to a character number.
- if ((uint8_t)(*p) >= 0x80) {
- int c = utf_ptr2char(p);
+ if ((*p) >= 0x80) {
+ int len = utf8len_tab[*p];
+ int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
// An illegal byte is displayed as <xx>.
- if (utf_ptr2len(p) == 1 || c == NUL) {
+ if (c <= 0) {
return 4;
}
// If the char is ASCII it must be an overlong sequence.
if (c < 0x80) {
return char2cells(c);
}
- return utf_char2cells(c);
+ int cells = utf_char2cells(c);
+ if (cells == 1 && p_emoji
+ && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ int c2 = utf_ptr2char(p_in + len);
+ if (c2 == 0xFE0F) {
+ return 2; // emoji presentation
+ }
+ }
+ return cells;
}
return 1;
}
@@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size)
{
// Need to convert to a wide character.
if (size > 0 && (uint8_t)(*p) >= 0x80) {
- if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
+ int len = utf_ptr2len_len(p, size);
+ if (len < utf8len_tab[(uint8_t)(*p)]) {
return 1; // truncated
}
int c = utf_ptr2char(p);
@@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size)
if (c < 0x80) {
return char2cells(c);
}
- return utf_char2cells(c);
+ int cells = utf_char2cells(c);
+ if (cells == 1 && p_emoji && size > len
+ && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+ && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
+ int c2 = utf_ptr2char(p + len);
+ if (c2 == 0xFE0F) {
+ return 2; // emoji presentation
+ }
+ }
+ return cells;
}
return 1;
}
@@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
size_t clen = 0;
for (const char *p = str; *p != NUL && p < str + size;
- p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
- clen += (size_t)utf_ptr2cells(p);
+ p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
+ clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
}
return clen;
@@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp)
return c;
}
+/// When "c" is the first char of a string, determine if it needs to be prefixed
+/// by a space byte to be drawn correctly, and not merge with the space left of
+/// the string.
+bool utf_iscomposing_first(int c)
+{
+ return c >= 128 && !utf8proc_grapheme_break(' ', c);
+}
+
/// Check if the character pointed to by "p2" is a composing character when it
-/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-bool utf_composinglike(const char *p1, const char *p2)
+/// comes after "p1".
+///
+/// We use the definition in UAX#29 as implemented by utf8proc with the following
+/// exceptions:
+///
+/// - ASCII chars always begin a new cluster. This is a long assumed invariant
+/// in the code base and very useful for performance (we can exit early for ASCII
+/// all over the place, branch predictor go brrr in ASCII-only text).
+/// As of Unicode 15.1 this will only break BOUNDCLASS_UREPEND followed by ASCII,
+/// which should be exceedingly rare (these PREPEND chars are expected to be
+/// followed by multibyte chars within the same script family)
+///
+/// - When 'arabicshape' is active, some pairs of arabic letters "ab" is replaced with
+/// "c" taking one single cell, which behaves like a cluster.
+///
+/// @param "state" should be set to GRAPHEME_STATE_INIT before first call
+/// it is allowed to be null, but will then not handle some longer
+/// sequences, like ZWJ based emoji
+bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
+ FUNC_ATTR_NONNULL_ARG(1, 2)
{
- int c2 = utf_ptr2char(p2);
- if (utf_iscomposing(c2)) {
- return true;
- }
- if (!arabic_maycombine(c2)) {
+ if ((uint8_t)(*p2) < 128) {
return false;
}
- return arabic_combine(utf_ptr2char(p1), c2);
-}
-/// Check if the next character is a composing character when it
-/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-/// returns false for negative values
-bool utf_char_composinglike(int32_t const first, int32_t const next)
- FUNC_ATTR_PURE
-{
- return utf_iscomposing(next) || arabic_combine(first, next);
+ int first = utf_ptr2char(p1);
+ int second = utf_ptr2char(p2);
+
+ if (!utf8proc_grapheme_break_stateful(first, second, state)) {
+ return true;
+ }
+
+ return arabic_combine(first, second);
}
/// Get the screen char at the beginning of a string
@@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
{
int c = utf_ptr2char(p);
*firstc = c; // NOT optional, you are gonna need it
- bool first_compose = utf_iscomposing(c);
+ bool first_compose = utf_iscomposing_first(c);
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
@@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
return schar_from_buf_first(p, len, first_compose);
}
-/// Get the screen char at the beginning of a string with length
+/// Get the screen char from a char with a known length
///
/// Like utfc_ptr2schar but use no more than p[maxlen].
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
FUNC_ATTR_NONNULL_ALL
{
- assert(maxlen > 0);
-
- size_t len = (size_t)utf_ptr2len_len(p, maxlen);
- if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+ if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
// invalid or truncated sequence
*firstc = (uint8_t)(*p);
return 0;
@@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
int c = utf_ptr2char(p);
*firstc = c;
- bool first_compose = utf_iscomposing(c);
- maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
- len = (size_t)utfc_ptr2len_len(p, maxlen);
+ bool first_compose = utf_iscomposing_first(c);
+ int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+ if (len > maxlen) {
+ len = utfc_ptr2len_len(p, maxlen);
+ }
- return schar_from_buf_first(p, len, first_compose);
+ return schar_from_buf_first(p, (size_t)len, first_compose);
}
/// Caller must ensure there is space for `first_compose`
@@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p)
// Check for composing characters.
int prevlen = 0;
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
- if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
+ if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
return len;
}
@@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size)
return 1;
}
- // Check for composing characters. We can handle only the first six, but
+ // Check for composing characters. We can only display a limited amount, but
// skip all of them (otherwise the cursor would get stuck).
int prevlen = 0;
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (len < size) {
if ((uint8_t)p[len] < 0x80) {
break;
@@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size)
break;
}
- if (!utf_composinglike(p + prevlen, p + len)) {
+ if (!utf_composinglike(p + prevlen, p + len, &state)) {
break;
}
@@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf)
}
}
-/// Return true if "c" is a composing UTF-8 character.
-/// This means it will be drawn on top of the preceding character.
+/// Return true if "c" is a legacy composing UTF-8 character.
+///
+/// This is deprecated in favour of utf_composinglike() which uses the modern
+/// stateful algorithm to determine grapheme clusters. Still available
+/// to support some legacy code which hasn't been refactored yet.
+///
+/// To check if a char would combine with a preceeding space, use
+/// utf_iscomposing_first() instead.
+///
/// Based on code from Markus Kuhn.
/// Returns false for negative values.
-bool utf_iscomposing(int c)
+bool utf_iscomposing_legacy(int c)
{
return intable(combining, ARRAY_SIZE(combining), c);
}
@@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 2;
}
-bool utf_ambiguous_width(int c)
+bool utf_ambiguous_width(const char *p)
{
+ int c = utf_ptr2char(p);
return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
|| intable(emoji_all, ARRAY_SIZE(emoji_all), c));
}
@@ -1666,6 +1714,26 @@ void show_utf8(void)
msg(IObuff, 0);
}
+/// @return true if boundclass bc always starts a new cluster regardless of what's before
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break(int bc)
+{
+ return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
+}
+
+/// @return true if bc2 always starts a cluster after bc1
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break_two(int bc1, int bc2)
+{
+ // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
+ // "always_break" on first iteration or when it was bc1 in the previous iteration
+ return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
+ || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
+ || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+ && (bc1 == UTF8PROC_BOUNDCLASS_OTHER
+ || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
+}
+
/// Return offset from "p" to the start of a character, including composing characters.
/// "base" must be the start of the string, which must be NUL terminated.
/// If "p" points to the NUL at the end of the string return 0.
@@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in)
const uint8_t *base = (uint8_t *)base_in;
const uint8_t *p = (uint8_t *)p_in;
- // Skip backwards over trailing bytes: 10xx.xxxx
- // Skip backwards again if on a composing char.
- const uint8_t *q;
- for (q = p;; q--) {
- // Move s to the last byte of this char.
- const uint8_t *s;
- for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
-
- // Move q to the first byte of this char.
- while (q > base && (*q & 0xc0) == 0x80) {
- q--;
- }
- // Check for illegal sequence. Do allow an illegal byte after where we
- // started.
- int len = utf8len_tab[*q];
- if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
- return 0;
+ const uint8_t *start = p;
+
+ // move start to the first byte of this codepoint
+ // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
+ while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
+ start--;
+ }
+
+ uint8_t cur_len = utf8len_tab[*start];
+ int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len);
+ if (cur_code < 0) {
+ return 0; // p must be part of an illegal sequence
+ }
+ const uint8_t * const safe_end = start + cur_len;
+
+ int cur_bc = utf8proc_get_property(cur_code)->boundclass;
+ if (always_break(cur_bc)) {
+ return (int)(p - start);
+ }
+
+ // backtrack to find the start of a cluster. we might go too far, checked in the next loop
+ const uint8_t *cur_pos = start;
+ const uint8_t *const p_start = start;
+
+ if (start == base) {
+ return (int)(p - start);
+ }
+
+ start--;
+ while (*start >= 0x80) { // stop on ascii, we are done
+ while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
+ start--;
}
- if (q <= base) {
+ int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]);
+ if (prev_code < 0) {
+ start = cur_pos; // start at valid sequence after invalid bytes
break;
}
- int c = utf_ptr2char((char *)q);
- if (utf_iscomposing(c)) {
- continue;
+ int prev_bc = utf8proc_get_property(prev_code)->boundclass;
+ if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
+ start = cur_pos; // prev_code cannot be a part of this cluster
+ break;
+ } else if (start == base) {
+ break;
}
+ cur_pos = start;
+ cur_bc = prev_bc;
+ cur_code = prev_code;
- if (arabic_maycombine(c)) {
- // Advance to get a sneak-peak at the next char
- const uint8_t *j = q;
- j--;
- // Move j to the first byte of this char.
- while (j > base && (*j & 0xc0) == 0x80) {
- j--;
- }
- if (arabic_combine(utf_ptr2char((char *)j), c)) {
- continue;
- }
+ start--;
+ }
+
+ // hot path: we are already on the first codepoint of a sequence
+ if (start == p_start) {
+ return (int)(p - start);
+ }
+
+ const uint8_t *q = start;
+ while (q < p) {
+ // don't need to find end of cluster. once we reached the codepoint of p, we are done
+ int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
+
+ if (q + len > p) {
+ return (int)(p - q);
}
- break;
+
+ q += len;
}
- return (int)(p - q);
+ return 0;
+}
+
+/// Assumes caller already handles ascii. see `utfc_next`
+StrCharInfo utfc_next_impl(StrCharInfo cur)
+{
+ int32_t prev_code = cur.chr.value;
+ uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
+ GraphemeState state = GRAPHEME_STATE_INIT;
+ assert(*next >= 0x80);
+
+ while (true) {
+ uint8_t const next_len = utf8len_tab[*next];
+ int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
+ if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
+ && !arabic_combine(prev_code, next_code)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
+ };
+ }
+
+ prev_code = next_code;
+ next += next_len;
+ if (EXPECT(*next < 0x80U, true)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = *next, .len = 1 },
+ };
+ }
+ }
}
// Whether space is NOT allowed before/after 'c'.
@@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
c = 0x100; break; // not in latin9
}
}
- if (!utf_iscomposing(c)) { // skip composing chars
+ if (!utf_iscomposing_legacy(c)) { // skip composing chars
if (c < 0x100) {
*d++ = (uint8_t)c;
} else if (vcp->vc_fail) {
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 6cbfbcbc3c..2da051fca2 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -3,6 +3,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h> // IWYU pragma: keep
+#include <utf8proc.h>
#include <uv.h> // IWYU pragma: keep
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
@@ -11,6 +12,9 @@
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
#include "nvim/types_defs.h" // IWYU pragma: keep
+typedef utf8proc_int32_t GraphemeState;
+#define GRAPHEME_STATE_INIT 0
+
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.h.generated.h"
# include "mbyte.h.inline.generated.h"
@@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
static inline StrCharInfo utfc_next(StrCharInfo cur)
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
{
- int32_t prev_code = cur.chr.value;
+ // handle ASCII case inline
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
-
- while (true) {
- if (EXPECT(*next < 0x80U, true)) {
- return (StrCharInfo){
- .ptr = (char *)next,
- .chr = (CharInfo){ .value = *next, .len = 1 },
- };
- }
- uint8_t const next_len = utf8len_tab[*next];
- int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
- if (!utf_char_composinglike(prev_code, next_code)) {
- return (StrCharInfo){
- .ptr = (char *)next,
- .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
- };
- }
-
- prev_code = next_code;
- next += next_len;
+ if (EXPECT(*next < 0x80U, true)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = *next, .len = 1 },
+ };
}
+
+ return utfc_next_impl(cur);
}
static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)
diff --git a/src/nvim/message.c b/src/nvim/message.c
index 53e5511a5a..79e6bc8be7 100644
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen)
// Last part: End of the string.
half = i = (int)strlen(s);
while (true) {
- do {
- half = half - utf_head_off(s, s + half - 1) - 1;
- } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half)));
+ half = half - utf_head_off(s, s + half - 1) - 1;
n = ptr2cells(s + half);
if (len + n > room || half == 0) {
break;
diff --git a/src/nvim/normal.c b/src/nvim/normal.c
index f44a64af21..f3bdea9a85 100644
--- a/src/nvim/normal.c
+++ b/src/nvim/normal.c
@@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s)
while ((s->c = vpeekc()) > 0
&& (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) {
s->c = plain_vgetc();
- if (!utf_iscomposing(s->c)) {
+ // TODO(bfredl): only allowing up to two composing chars is cringe af.
+ // Could reuse/abuse schar_T to at least allow us to input anything we are able
+ // to display and use the stateful utf8proc algorithm like utf_composinglike
+ if (!utf_iscomposing_legacy(s->c)) {
vungetc(s->c); // it wasn't, put it back
break;
} else if (s->ca.ncharC1 == 0) {
diff --git a/src/nvim/options.lua b/src/nvim/options.lua
index 3612a80fb8..1c17b0fc9f 100644
--- a/src/nvim/options.lua
+++ b/src/nvim/options.lua
@@ -2326,9 +2326,12 @@ return {
desc = [=[
When on all Unicode emoji characters are considered to be full width.
This excludes "text emoji" characters, which are normally displayed as
- single width. Unfortunately there is no good specification for this
- and it has been determined on trial-and-error basis. Use the
- |setcellwidths()| function to change the behavior.
+ single width. However, such "text emoji" are treated as full-width
+ emoji if they are followed by the U+FE0F variant selector.
+
+ Unfortunately there is no good specification for this and it has been
+ determined on trial-and-error basis. Use the |setcellwidths()|
+ function to change the behavior.
]=],
full_name = 'emoji',
redraw = { 'all_windows', 'ui_option' },
diff --git a/src/nvim/plines.c b/src/nvim/plines.c
index e51e9bf8c3..408fe26bf3 100644
--- a/src/nvim/plines.c
+++ b/src/nvim/plines.c
@@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
} else if (cur_char < 0) {
size = kInvalidByteCells;
} else {
- size = char2cells(cur_char);
+ size = ptr2cells(cur);
is_doublewidth = size == 2 && cur_char > 0x80;
}
@@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco
///
/// @see charsize_regular
/// @see charsize_fast
-static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol,
- int32_t const cur_char)
+static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop,
+ colnr_T const vcol, int32_t const cur_char)
FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE
{
// A tab gets expanded, depending on the current column
@@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
if (cur_char < 0) {
width = kInvalidByteCells;
} else {
- width = char2cells(cur_char);
+ // TODO(bfredl): perf: often cur_char is enough at this point to determine width.
+ // we likely want a specialized version of utf_ptr2StrCharInfo also determining
+ // the ptr2cells width at the same time without any extra decoding. (also applies
+ // to charsize_regular and charsize_nowrap)
+ width = ptr2cells(cur);
}
// If a double-width char doesn't fit at the end of a line, it wraps to the next line,
@@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col
/// Can be used if CSType is kCharsizeFast.
///
/// @see charsize_regular
-CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char)
+CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char)
FUNC_ATTR_PURE
{
- return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char);
+ return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char);
}
/// Get the number of cells taken up on the screen at given virtual column.
///
/// @see win_chartabsize()
-int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char)
+int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char)
{
if (cur_char == TAB && use_tabstop) {
return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array);
} else if (cur_char < 0) {
return kInvalidByteCells;
} else {
- return char2cells(cur_char);
+ return ptr2cells(cur);
}
}
@@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le
StrCharInfo ci = utf_ptr2StrCharInfo(line);
while (ci.ptr - line < len && *ci.ptr != NUL) {
- vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width;
+ vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width;
ci = utfc_next(ci);
if (vcol > MAXCOL) {
vcol_arg = MAXCOL;
@@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en
char_size = (CharSize){ .width = 1 };
break;
}
- char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value);
+ char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value);
StrCharInfo const next = utfc_next(ci);
if (next.ptr - line > end_col) {
break;
@@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e
if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) {
int c = utf_ptr2char(ptr + pos->col);
if ((c != TAB) && vim_isprintc(c)) {
- endadd = (colnr_T)(char2cells(c) - 1);
+ endadd = (colnr_T)(ptr2cells(ptr + pos->col) - 1);
if (coladd > endadd) {
// past end of line
endadd = 0;
@@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column)
if (cstype == kCharsizeFast) {
bool const use_tabstop = csarg.use_tabstop;
while (*ci.ptr != NUL && --column >= 0) {
- vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width;
+ vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width;
ci = utfc_next(ci);
}
} else {
diff --git a/src/nvim/plines.h b/src/nvim/plines.h
index 7128e37237..50310b8ce1 100644
--- a/src/nvim/plines.h
+++ b/src/nvim/plines.h
@@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
{
if (cstype == kCharsizeFast) {
- return charsize_fast(csarg, vcol, chr);
+ return charsize_fast(csarg, ptr, vcol, chr);
} else {
return charsize_regular(csarg, ptr, vcol, chr);
}
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 7dbbb19545..c91c112c3c 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
{
return utf_char2len(c) > 1
&& (re_multi_type(peekchr()) != NOT_MULTI
- || utf_iscomposing(c));
+ || utf_iscomposing_legacy(c));
}
// Emit (if appropriate) a byte of code
@@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
- if (c == Magic('.') && utf_iscomposing(peekchr())) {
+ if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
c = getchr();
goto do_multibyte;
}
@@ -5001,9 +5001,10 @@ do_multibyte:
int l;
// Need to get composing character too.
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
l = utf_ptr2len(regparse);
- if (!utf_composinglike(regparse, regparse + l)) {
+ if (!utf_composinglike(regparse, regparse + l, &state)) {
break;
}
regmbc(utf_ptr2char(regparse));
@@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
// Check for following composing character, unless %C
// follows (skips over all composing chars).
if (status != RA_NOMATCH
- && utf_composinglike((char *)rex.input, (char *)rex.input + len)
+ && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
&& !rex.reg_icombine
&& OP(next) != RE_COMPOSING) {
// raaron: This code makes a composing character get
@@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
break;
}
const int opndc = utf_ptr2char((char *)opnd);
- if (utf_iscomposing(opndc)) {
+ if (utf_iscomposing_legacy(opndc)) {
// When only a composing char is given match at any
// position where that composing char appears.
status = RA_NOMATCH;
for (i = 0; rex.input[i] != NUL;
i += utf_ptr2len((char *)rex.input + i)) {
const int inpc = utf_ptr2char((char *)rex.input + i);
- if (!utf_iscomposing(inpc)) {
+ if (!utf_iscomposing_legacy(inpc)) {
if (i > 0) {
break;
}
@@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
case RE_COMPOSING:
// Skip composing characters.
- while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
+ while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
rex.input += utf_ptr2len((char *)rex.input);
}
break;
@@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
- if (c == Magic('.') && utf_iscomposing(peekchr())) {
+ if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
old_regparse = (uint8_t *)regparse;
c = getchr();
goto nfa_do_multibyte;
@@ -10705,7 +10706,7 @@ collection:
nfa_do_multibyte:
// plen is length of current char with composing chars
if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
- || utf_iscomposing(c)) {
+ || utf_iscomposing_legacy(c)) {
int i = 0;
// A base character plus composing characters, or just one
@@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
}
if (match
// check that no composing char follows
- && !utf_iscomposing(utf_ptr2char((char *)s2))) {
+ && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
cleanup_subexpr();
if (REG_MULTI) {
rex.reg_startpos[0].lnum = rex.lnum;
@@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
// is not really a match.
if (!rex.reg_icombine
&& rex.input != rex.line
- && utf_iscomposing(curc)) {
+ && utf_iscomposing_legacy(curc)) {
break;
}
nfa_match = true;
@@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
sta = t->state->out;
len = 0;
- if (utf_iscomposing(sta->c)) {
+ if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
int j;
sta = t->state->out->out;
- if (utf_iscomposing(sta->c)) {
+ if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
case NFA_ANY_COMPOSING:
// On a composing character skip over it. Otherwise do
// nothing. Always matches.
- if (utf_iscomposing(curc)) {
+ if (utf_iscomposing_legacy(curc)) {
add_off = clen;
} else {
add_here = true;
diff --git a/src/nvim/search.c b/src/nvim/search.c
index 9e00664d86..ff6e135df1 100644
--- a/src/nvim/search.c
+++ b/src/nvim/search.c
@@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen
// empty for the search_stat feature.
if (!cmd_silent) {
msgbuf[0] = (char)dirc;
- if (utf_iscomposing(utf_ptr2char(p))) {
+ if (utf_iscomposing_first(utf_ptr2char(p))) {
// Use a space to draw the composing char on.
msgbuf[1] = ' ';
memmove(msgbuf + 2, p, plen);
diff --git a/src/nvim/sign.c b/src/nvim/sign.c
index 9b2516ed83..b4ba7833e9 100644
--- a/src/nvim/sign.c
+++ b/src/nvim/sign.c
@@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text)
if (!vim_isprintc(c)) {
break;
}
- int width = utf_char2cells(c);
+ int width = utf_ptr2cells(s);
if (width == 2) {
sign_text[cells + 1] = 0;
}
diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c
index d6053a533e..b37f01e769 100644
--- a/src/nvim/spellsuggest.c
+++ b/src/nvim/spellsuggest.c
@@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
// For changing a composing character adjust
// the score from SCORE_SUBST to
// SCORE_SUBCOMP.
- if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen
- - sp->ts_tcharlen))
- && utf_iscomposing(utf_ptr2char(fword
- + sp->ts_fcharstart))) {
+ if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen))
+ && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) {
sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP;
} else if (!soundfold
&& slang->sl_has_map
@@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
&& sp->ts_twordlen > sp->ts_tcharlen) {
p = tword + sp->ts_twordlen - sp->ts_tcharlen;
c = utf_ptr2char(p);
- if (utf_iscomposing(c)) {
+ if (utf_iscomposing_legacy(c)) {
// Inserting a composing char doesn't
// count that much.
sp->ts_score -= SCORE_INS - SCORE_INSCOMP;
@@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun
c = utf_ptr2char(fword + sp->ts_fidx);
stack[depth].ts_fidx =
(uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx));
- if (utf_iscomposing(c)) {
+ if (utf_iscomposing_legacy(c)) {
stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP;
} else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) {
stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP;
diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c
index 96907362dd..30c7d0ee92 100644
--- a/src/nvim/textformat.c
+++ b/src/nvim/textformat.c
@@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space
///< under the cursor
#define WHITECHAR(cc) (ascii_iswhite(cc) \
- && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
+ && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1)))
/// Return true if format option 'x' is in effect.
/// Take care of no formatting when 'paste' is set.
diff --git a/src/nvim/tui/tui.c b/src/nvim/tui/tui.c
index 1866a4a592..7e1068ed56 100644
--- a/src/nvim/tui/tui.c
+++ b/src/nvim/tui/tui.c
@@ -109,6 +109,7 @@ struct TUIData {
bool set_cursor_color_as_str;
bool cursor_color_changed;
bool is_starting;
+ bool did_set_grapheme_cluster_mode;
FILE *screenshot;
cursorentry_T cursor_shapes[SHAPE_IDX_COUNT];
HlAttrs clear_attrs;
@@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set)
void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
FUNC_ATTR_NONNULL_ALL
{
+ bool is_set = false;
switch (state) {
case kTermModeNotRecognized:
case kTermModePermanentlySet:
@@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
// then there is nothing to do
break;
case kTermModeSet:
+ is_set = true;
+ FALLTHROUGH;
case kTermModeReset:
// The terminal supports changing the given mode
switch (mode) {
@@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state)
signal_watcher_stop(&tui->winch_handle);
tui_set_term_mode(tui, mode, true);
break;
+ case kTermModeGraphemeClusters:
+ if (!is_set) {
+ tui_set_term_mode(tui, mode, true);
+ tui->did_set_grapheme_cluster_mode = true;
+ }
+ break;
}
}
}
@@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui)
if (!nsterm) {
tui_request_term_mode(tui, kTermModeSynchronizedOutput);
tui_request_term_mode(tui, kTermModeResizeEvents);
+ tui_request_term_mode(tui, kTermModeGraphemeClusters);
}
// Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it.
@@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui)
// Disable resize events
tui_set_term_mode(tui, kTermModeResizeEvents, false);
-
+ if (tui->did_set_grapheme_cluster_mode) {
+ tui_set_term_mode(tui, kTermModeGraphemeClusters, false);
+ }
// May restore old title before exiting alternate screen.
tui_set_title(tui, NULL_STRING);
if (ui_client_exit_status == 0) {
@@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool
char buf[MAX_SCHAR_SIZE];
schar_get(buf, cell->data);
int c = utf_ptr2char(buf);
- bool is_ambiwidth = utf_ambiguous_width(c);
+ bool is_ambiwidth = utf_ambiguous_width(buf);
if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) {
// If the server used setcellwidths() to treat a single-width char as double-width,
// it needs to be treated like an ambiguous-width char.
diff --git a/src/nvim/tui/tui_defs.h b/src/nvim/tui/tui_defs.h
index 46913e07a2..bd99d6b0ad 100644
--- a/src/nvim/tui/tui_defs.h
+++ b/src/nvim/tui/tui_defs.h
@@ -4,6 +4,7 @@ typedef struct TUIData TUIData;
typedef enum {
kTermModeSynchronizedOutput = 2026,
+ kTermModeGraphemeClusters = 2027,
kTermModeResizeEvents = 2048,
} TermMode;