diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/nvim/api/extmark.c | 2 | ||||
-rw-r--r-- | src/nvim/api/ui.c | 2 | ||||
-rw-r--r-- | src/nvim/change.c | 7 | ||||
-rw-r--r-- | src/nvim/digraph.c | 2 | ||||
-rw-r--r-- | src/nvim/drawline.c | 22 | ||||
-rw-r--r-- | src/nvim/edit.c | 12 | ||||
-rw-r--r-- | src/nvim/ex_cmds.c | 2 | ||||
-rw-r--r-- | src/nvim/ex_getln.c | 6 | ||||
-rw-r--r-- | src/nvim/grid.c | 37 | ||||
-rw-r--r-- | src/nvim/mbyte.c | 286 | ||||
-rw-r--r-- | src/nvim/mbyte.h | 32 | ||||
-rw-r--r-- | src/nvim/message.c | 4 | ||||
-rw-r--r-- | src/nvim/normal.c | 5 | ||||
-rw-r--r-- | src/nvim/options.lua | 9 | ||||
-rw-r--r-- | src/nvim/plines.c | 28 | ||||
-rw-r--r-- | src/nvim/plines.h | 2 | ||||
-rw-r--r-- | src/nvim/regexp.c | 29 | ||||
-rw-r--r-- | src/nvim/search.c | 2 | ||||
-rw-r--r-- | src/nvim/sign.c | 2 | ||||
-rw-r--r-- | src/nvim/spellsuggest.c | 10 | ||||
-rw-r--r-- | src/nvim/textformat.c | 2 | ||||
-rw-r--r-- | src/nvim/tui/tui.c | 17 | ||||
-rw-r--r-- | src/nvim/tui/tui_defs.h | 1 |
23 files changed, 345 insertions, 176 deletions
diff --git a/src/nvim/api/extmark.c b/src/nvim/api/extmark.c index 1673519479..d694b64f66 100644 --- a/src/nvim/api/extmark.c +++ b/src/nvim/api/extmark.c @@ -571,7 +571,7 @@ Integer nvim_buf_set_extmark(Buffer buffer, Integer ns_id, Integer line, Integer String c = opts->conceal; if (c.size > 0) { int ch; - hl.conceal_char = utfc_ptr2schar_len(c.data, (int)c.size, &ch); + hl.conceal_char = utfc_ptr2schar(c.data, &ch); if (!hl.conceal_char || !vim_isprintc(ch)) { api_set_error(err, kErrorTypeValidation, "conceal char has to be printable"); goto error; diff --git a/src/nvim/api/ui.c b/src/nvim/api/ui.c index 82a5ff5f8e..a99d97acb8 100644 --- a/src/nvim/api/ui.c +++ b/src/nvim/api/ui.c @@ -847,7 +847,7 @@ void remote_ui_raw_line(RemoteUI *ui, Integer grid, Integer row, Integer startco char sc_buf[MAX_SCHAR_SIZE]; schar_get(sc_buf, chunk[i]); remote_ui_put(ui, sc_buf); - if (utf_ambiguous_width(utf_ptr2char(sc_buf))) { + if (utf_ambiguous_width(sc_buf)) { ui->client_col = -1; // force cursor update } } diff --git a/src/nvim/change.c b/src/nvim/change.c index 6e9fab5a9b..47a9f0ce92 100644 --- a/src/nvim/change.c +++ b/src/nvim/change.c @@ -896,14 +896,15 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) // delete the last combining character. if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) { char *p0 = oldp + col; - if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) { + GraphemeState state = GRAPHEME_STATE_INIT; + if (utf_composinglike(p0, p0 + utf_ptr2len(p0), &state)) { // Find the last composing char, there can be several. 
int n = col; do { col = n; count = utf_ptr2len(oldp + n); n += count; - } while (utf_composinglike(oldp + col, oldp + n)); + } while (utf_composinglike(oldp + col, oldp + n, &state)); fixpos = false; } } @@ -1694,7 +1695,7 @@ bool open_line(int dir, int flags, int second_line_indent, bool *did_do_comment) } if (curbuf->b_p_ai || (flags & OPENLINE_DELSPACES)) { while ((*p_extra == ' ' || *p_extra == '\t') - && !utf_iscomposing(utf_ptr2char(p_extra + 1))) { + && !utf_iscomposing_first(utf_ptr2char(p_extra + 1))) { if (REPLACE_NORMAL(State)) { replace_push(*p_extra); } diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c index 8149c5964f..7413d33fe4 100644 --- a/src/nvim/digraph.c +++ b/src/nvim/digraph.c @@ -1865,7 +1865,7 @@ static void printdigraph(const digr_T *dp, result_T *previous) p = buf; // add a space to draw a composing char on - if (utf_iscomposing(dp->result)) { + if (utf_iscomposing_first(dp->result)) { *p++ = ' '; } p += utf_char2bytes(dp->result, p); diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c index 8a948716e5..b5273a54ca 100644 --- a/src/nvim/drawline.c +++ b/src/nvim/drawline.c @@ -1826,7 +1826,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the last column. // Don't advance the pointer but put the character at the start of the next line. - if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_c = '>'; mb_l = 1; (void)mb_l; @@ -1922,7 +1922,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s // If a double-width char doesn't fit display a '>' in the // last column; the character is displayed at the start of the // next line. 
- if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { + if (wlv.col >= grid->cols - 1 && schar_cells(mb_schar) == 2) { mb_schar = schar_from_ascii('>'); mb_c = '>'; mb_l = 1; @@ -2393,6 +2393,12 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s || (decor_conceal && decor_state.conceal_char) || wp->w_p_cole == 1) && wp->w_p_cole != 3) { + if (schar_cells(mb_schar) > 1) { + // When the first char to be concealed is double-width, + // need to advance one more virtual column. + wlv.n_extra++; + } + // First time at this concealed item: display one // character. if (has_match_conc && match_conc) { @@ -2410,12 +2416,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s mb_schar = schar_from_ascii(' '); } - if (utf_char2cells(mb_c) > 1) { - // When the first char to be concealed is double-width, - // need to advance one more virtual column. - wlv.n_extra++; - } - mb_c = schar_get_first_codepoint(mb_schar); prev_syntax_id = syntax_seqnr; @@ -2484,7 +2484,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s && mb_schar != NUL) { mb_schar = wp->w_p_lcs_chars.prec; lcs_prec_todo = NUL; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Double-width character being overwritten by the "precedes" // character, need to fill up half the character. wlv.sc_extra = schar_from_ascii(MB_FILLER_CHAR); @@ -2725,7 +2725,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s linebuf_vcol[wlv.off] = wlv.vcol; - if (utf_char2cells(mb_c) > 1) { + if (schar_cells(mb_schar) > 1) { // Need to fill two screen columns. 
wlv.off++; wlv.col++; @@ -2744,7 +2744,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, int col_rows, s wlv.off++; wlv.col++; } else if (wp->w_p_cole > 0 && is_concealing) { - bool concealed_wide = utf_char2cells(mb_c) > 1; + bool concealed_wide = schar_cells(mb_schar) > 1; wlv.skip_cells--; wlv.vcol_off_co++; diff --git a/src/nvim/edit.c b/src/nvim/edit.c index 00ce38c4b1..f8723f9680 100644 --- a/src/nvim/edit.c +++ b/src/nvim/edit.c @@ -2832,6 +2832,8 @@ int replace_push_mb(char *p) { int l = utfc_ptr2len(p); + // TODO(bfredl): stop doing this insanity and instead use utf_head_off() when popping. + // or just keep a secondary array with char byte lengths for (int j = l - 1; j >= 0; j--) { replace_push(p[j]); } @@ -2911,7 +2913,9 @@ static void mb_replace_pop_ins(int cc) for (int i = 1; i < n; i++) { buf[i] = (uint8_t)replace_pop(); } - if (utf_iscomposing(utf_ptr2char((char *)buf))) { + // TODO(bfredl): by fixing replace_push_mb, upgrade to use + // the new composing algorithm + if (utf_iscomposing_legacy(utf_ptr2char((char *)buf))) { ins_bytes_len((char *)buf, (size_t)n); } else { // Not a composing char, put it back. @@ -3843,7 +3847,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) space_sci = sci; space_vcol = vcol; } - vcol += charsize_nowrap(curbuf, use_ts, vcol, sci.chr.value); + vcol += charsize_nowrap(curbuf, sci.ptr, use_ts, vcol, sci.chr.value); sci = utfc_next(sci); prev_space = cur_space; } @@ -3859,7 +3863,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) // Find the position to stop backspacing. // Use charsize_nowrap() so that virtual text and wrapping are ignored. 
while (true) { - int size = charsize_nowrap(curbuf, use_ts, space_vcol, space_sci.chr.value); + int size = charsize_nowrap(curbuf, space_sci.ptr, use_ts, space_vcol, space_sci.chr.value); if (space_vcol + size > want_vcol) { break; } @@ -3930,7 +3934,7 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) bool has_composing = false; if (p_deco) { char *p0 = get_cursor_pos_ptr(); - has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0)); + has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0), NULL); } del_char(false); // If there are combining characters and 'delcombine' is set diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c index 6ac73527ee..1b6861f750 100644 --- a/src/nvim/ex_cmds.c +++ b/src/nvim/ex_cmds.c @@ -204,7 +204,7 @@ void do_ascii(exarg_T *eap) IObuff[iobuff_len++] = ' '; } IObuff[iobuff_len++] = '<'; - if (utf_iscomposing(c)) { + if (utf_iscomposing_first(c)) { IObuff[iobuff_len++] = ' '; // Draw composing char on top of a space. } iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c index 8a34e03d91..722a857f03 100644 --- a/src/nvim/ex_getln.c +++ b/src/nvim/ex_getln.c @@ -2118,7 +2118,7 @@ static int command_line_handle_key(CommandLineState *s) s->do_abbr = false; // don't do abbreviation now ccline.special_char = NUL; // may need to remove ^ when composing char was typed - if (utf_iscomposing(s->c) && !cmd_silent) { + if (utf_iscomposing_first(s->c) && !cmd_silent) { if (ui_has(kUICmdline)) { // TODO(bfredl): why not make unputcmdline also work with true? unputcmdline(); @@ -3585,7 +3585,9 @@ void put_on_cmdline(const char *str, int len, bool redraw) // backup to the character before it. There could be two of them. 
int i = 0; int c = utf_ptr2char(ccline.cmdbuff + ccline.cmdpos); - while (ccline.cmdpos > 0 && utf_iscomposing(c)) { + // TODO(bfredl): this can be corrected/simplified as utf_head_off implements the + // correct grapheme cluster breaks + while (ccline.cmdpos > 0 && utf_iscomposing_legacy(c)) { i = utf_head_off(ccline.cmdbuff, ccline.cmdbuff + ccline.cmdpos - 1) + 1; ccline.cmdpos -= i; len += i; diff --git a/src/nvim/grid.c b/src/nvim/grid.c index 56246bf001..acb336c725 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -186,6 +186,24 @@ size_t schar_len(schar_T sc) } } +int schar_cells(schar_T sc) +{ + // hot path +#ifdef ORDER_BIG_ENDIAN + if (!(sc & 0x80FFFFFF)) { + return 1; + } +#else + if (sc < 0x80) { + return 1; + } +#endif + + char sc_buf[MAX_SCHAR_SIZE]; + schar_get(sc_buf, sc); + return utf_ptr2cells(sc_buf); +} + /// gets first raw UTF-8 byte of an schar static char schar_get_first_byte(schar_T sc) { @@ -428,14 +446,19 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) const int max_col = grid_line_maxcol; while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { // check if this is the first byte of a multibyte - int mbyte_blen = len > 0 - ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) - : utfc_ptr2len(ptr); + int mbyte_blen; + if (len >= 0) { + int maxlen = (int)((text + len) - ptr); + mbyte_blen = utfc_ptr2len_len(ptr, maxlen); + if (mbyte_blen > maxlen) { + mbyte_blen = 1; + } + } else { + mbyte_blen = utfc_ptr2len(ptr); + } int firstc; - schar_T schar = len >= 0 - ? 
utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc) - : utfc_ptr2schar(ptr, &firstc); - int mbyte_cells = utf_char2cells(firstc); + schar_T schar = utfc_ptrlen2schar(ptr, mbyte_blen, &firstc); + int mbyte_cells = utf_ptr2cells_len(ptr, mbyte_blen); if (mbyte_cells > 2 || schar == 0) { mbyte_cells = 1; schar = schar_from_char(0xFFFD); diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0c1b537f3a..666a904fc5 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -511,20 +511,30 @@ int utf_char2cells(int c) /// Return the number of display cells character at "*p" occupies. /// This doesn't take care of unprintable characters, use ptr2cells() for that. -int utf_ptr2cells(const char *p) +int utf_ptr2cells(const char *p_in) { + const uint8_t *p = (const uint8_t *)p_in; // Need to convert to a character number. - if ((uint8_t)(*p) >= 0x80) { - int c = utf_ptr2char(p); + if ((*p) >= 0x80) { + int len = utf8len_tab[*p]; + int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len); // An illegal byte is displayed as <xx>. - if (utf_ptr2len(p) == 1 || c == NUL) { + if (c <= 0) { return 4; } // If the char is ASCII it must be an overlong sequence. if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji + && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + int c2 = utf_ptr2char(p_in + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -603,7 +613,8 @@ int utf_ptr2cells_len(const char *p, int size) { // Need to convert to a wide character. 
if (size > 0 && (uint8_t)(*p) >= 0x80) { - if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) { + int len = utf_ptr2len_len(p, size); + if (len < utf8len_tab[(uint8_t)(*p)]) { return 1; // truncated } int c = utf_ptr2char(p); @@ -615,7 +626,16 @@ int utf_ptr2cells_len(const char *p, int size) if (c < 0x80) { return char2cells(c); } - return utf_char2cells(c); + int cells = utf_char2cells(c); + if (cells == 1 && p_emoji && size > len + && intable(emoji_all, ARRAY_SIZE(emoji_all), c) + && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) { + int c2 = utf_ptr2char(p + len); + if (c2 == 0xFE0F) { + return 2; // emoji presentation + } + } + return cells; } return 1; } @@ -648,8 +668,8 @@ size_t mb_string2cells_len(const char *str, size_t size) size_t clen = 0; for (const char *p = str; *p != NUL && p < str + size; - p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) { - clen += (size_t)utf_ptr2cells(p); + p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) { + clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str)); } return clen; @@ -793,29 +813,48 @@ int mb_cptr2char_adv(const char **pp) return c; } +/// When "c" is the first char of a string, determine if it needs to be prefixed +/// by a space byte to be drawn correctly, and not merge with the space left of +/// the string. +bool utf_iscomposing_first(int c) +{ + return c >= 128 && !utf8proc_grapheme_break(' ', c); +} + /// Check if the character pointed to by "p2" is a composing character when it -/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. -bool utf_composinglike(const char *p1, const char *p2) +/// comes after "p1". +/// +/// We use the definition in UAX#29 as implemented by utf8proc with the following +/// exceptions: +/// +/// - ASCII chars always begin a new cluster. 
This is a long assumed invariant +/// in the code base and very useful for performance (we can exit early for ASCII +/// all over the place, branch predictor go brrr in ASCII-only text). +/// As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII, +/// which should be exceedingly rare (these PREPEND chars are expected to be +/// followed by multibyte chars within the same script family) +/// +/// - When 'arabicshape' is active, some pairs of arabic letters "ab" are replaced with +/// "c" taking one single cell, which behaves like a cluster. +/// +/// @param "state" should be set to GRAPHEME_STATE_INIT before first call +/// it is allowed to be null, but will then not handle some longer +/// sequences, like ZWJ based emoji +bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state) + FUNC_ATTR_NONNULL_ARG(1, 2) { - int c2 = utf_ptr2char(p2); - if (utf_iscomposing(c2)) { - return true; - } - if (!arabic_maycombine(c2)) { + if ((uint8_t)(*p2) < 128) { return false; } - return arabic_combine(utf_ptr2char(p1), c2); -} -/// Check if the next character is a composing character when it -/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which -/// behaves like a composing character. 
-/// returns false for negative values -bool utf_char_composinglike(int32_t const first, int32_t const next) - FUNC_ATTR_PURE -{ - return utf_iscomposing(next) || arabic_combine(first, next); + int first = utf_ptr2char(p1); + int second = utf_ptr2char(p2); + + if (!utf8proc_grapheme_break_stateful(first, second, state)) { + return true; + } + + return arabic_combine(first, second); } /// Get the screen char at the beginning of a string @@ -834,7 +873,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) { int c = utf_ptr2char(p); *firstc = c; // NOT optional, you are gonna need it - bool first_compose = utf_iscomposing(c); + bool first_compose = utf_iscomposing_first(c); size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); @@ -845,16 +884,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc) return schar_from_buf_first(p, len, first_compose); } -/// Get the screen char at the beginning of a string with length +/// Get the screen char from a char with a known length /// /// Like utfc_ptr2schar but use no more than p[maxlen]. 
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) +schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc) FUNC_ATTR_NONNULL_ALL { - assert(maxlen > 0); - - size_t len = (size_t)utf_ptr2len_len(p, maxlen); - if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { // invalid or truncated sequence *firstc = (uint8_t)(*p); return 0; @@ -862,11 +898,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) int c = utf_ptr2char(p); *firstc = c; - bool first_compose = utf_iscomposing(c); - maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); - len = (size_t)utfc_ptr2len_len(p, maxlen); + bool first_compose = utf_iscomposing_first(c); + int maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + if (len > maxlen) { + len = utfc_ptr2len_len(p, maxlen); + } - return schar_from_buf_first(p, len, first_compose); + return schar_from_buf_first(p, (size_t)len, first_compose); } /// Caller must ensure there is space for `first_compose` @@ -964,8 +1002,9 @@ int utfc_ptr2len(const char *const p) // Check for composing characters. int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (true) { - if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { + if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) { return len; } @@ -996,9 +1035,10 @@ int utfc_ptr2len_len(const char *p, int size) return 1; } - // Check for composing characters. We can handle only the first six, but + // Check for composing characters. We can only display a limited amount, but // skip all of them (otherwise the cursor would get stuck). 
int prevlen = 0; + GraphemeState state = GRAPHEME_STATE_INIT; while (len < size) { if ((uint8_t)p[len] < 0x80) { break; @@ -1011,7 +1051,7 @@ int utfc_ptr2len_len(const char *p, int size) break; } - if (!utf_composinglike(p + prevlen, p + len)) { + if (!utf_composinglike(p + prevlen, p + len, &state)) { break; } @@ -1084,11 +1124,18 @@ int utf_char2bytes(const int c, char *const buf) } } -/// Return true if "c" is a composing UTF-8 character. -/// This means it will be drawn on top of the preceding character. +/// Return true if "c" is a legacy composing UTF-8 character. +/// +/// This is deprecated in favour of utf_composinglike() which uses the modern +/// stateful algorithm to determine grapheme clusters. Still available +/// to support some legacy code which hasn't been refactored yet. +/// +/// To check if a char would combine with a preceeding space, use +/// utf_iscomposing_first() instead. +/// /// Based on code from Markus Kuhn. /// Returns false for negative values. -bool utf_iscomposing(int c) +bool utf_iscomposing_legacy(int c) { return intable(combining, ARRAY_SIZE(combining), c); } @@ -1278,8 +1325,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 2; } -bool utf_ambiguous_width(int c) +bool utf_ambiguous_width(const char *p) { + int c = utf_ptr2char(p); return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c) || intable(emoji_all, ARRAY_SIZE(emoji_all), c)); } @@ -1666,6 +1714,26 @@ void show_utf8(void) msg(IObuff, 0); } +/// @return true if boundclass bc always starts a new cluster regardless of what's before +/// false negatives are allowed (perf cost, not correctness) +static bool always_break(int bc) +{ + return (bc == UTF8PROC_BOUNDCLASS_CONTROL); +} + +/// @return true if bc2 always starts a cluster after bc1 +/// false negatives are allowed (perf cost, not correctness) +static bool always_break_two(int bc1, int bc2) +{ + // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by + // 
"always_break" on first iteration or when it was bc1 in the previous iteration + return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER) + || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL) + || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + && (bc1 == UTF8PROC_BOUNDCLASS_OTHER + || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC))); +} + /// Return offset from "p" to the start of a character, including composing characters. /// "base" must be the start of the string, which must be NUL terminated. /// If "p" points to the NUL at the end of the string return 0. @@ -1679,50 +1747,108 @@ int utf_head_off(const char *base_in, const char *p_in) const uint8_t *base = (uint8_t *)base_in; const uint8_t *p = (uint8_t *)p_in; - // Skip backwards over trailing bytes: 10xx.xxxx - // Skip backwards again if on a composing char. - const uint8_t *q; - for (q = p;; q--) { - // Move s to the last byte of this char. - const uint8_t *s; - for (s = q; (s[1] & 0xc0) == 0x80; s++) {} - - // Move q to the first byte of this char. - while (q > base && (*q & 0xc0) == 0x80) { - q--; - } - // Check for illegal sequence. Do allow an illegal byte after where we - // started. - int len = utf8len_tab[*q]; - if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) { - return 0; + const uint8_t *start = p; + + // move start to the first byte of this codepoint + // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl + while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) { + start--; + } + + uint8_t cur_len = utf8len_tab[*start]; + int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)cur_len); + if (cur_code < 0) { + return 0; // p must be part of an illegal sequence + } + const uint8_t * const safe_end = start + cur_len; + + int cur_bc = utf8proc_get_property(cur_code)->boundclass; + if (always_break(cur_bc)) { + return (int)(p - start); + } + + // backtrack to find the start of a cluster. 
we might go too far, checked in the next loop + const uint8_t *cur_pos = start; + const uint8_t *const p_start = start; + + if (start == base) { + return (int)(p - start); + } + + start--; + while (*start >= 0x80) { // stop on ascii, we are done + while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) { + start--; } - if (q <= base) { + int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)utf8len_tab[*start]); + if (prev_code < 0) { + start = cur_pos; // start at valid sequence after invalid bytes break; } - int c = utf_ptr2char((char *)q); - if (utf_iscomposing(c)) { - continue; + int prev_bc = utf8proc_get_property(prev_code)->boundclass; + if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) { + start = cur_pos; // prev_code cannot be a part of this cluster + break; + } else if (start == base) { + break; } + cur_pos = start; + cur_bc = prev_bc; + cur_code = prev_code; - if (arabic_maycombine(c)) { - // Advance to get a sneak-peak at the next char - const uint8_t *j = q; - j--; - // Move j to the first byte of this char. - while (j > base && (*j & 0xc0) == 0x80) { - j--; - } - if (arabic_combine(utf_ptr2char((char *)j), c)) { - continue; - } + start--; + } + + // hot path: we are already on the first codepoint of a sequence + if (start == p_start) { + return (int)(p - start); + } + + const uint8_t *q = start; + while (q < p) { + // don't need to find end of cluster. once we reached the codepoint of p, we are done + int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q)); + + if (q + len > p) { + return (int)(p - q); } - break; + + q += len; } - return (int)(p - q); + return 0; +} + +/// Assumes caller already handles ascii. 
see `utfc_next` +StrCharInfo utfc_next_impl(StrCharInfo cur) +{ + int32_t prev_code = cur.chr.value; + uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); + GraphemeState state = GRAPHEME_STATE_INIT; + assert(*next >= 0x80); + + while (true) { + uint8_t const next_len = utf8len_tab[*next]; + int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); + if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state) + && !arabic_combine(prev_code, next_code)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, + }; + } + + prev_code = next_code; + next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; + } + } } // Whether space is NOT allowed before/after 'c'. @@ -2681,7 +2807,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si c = 0x100; break; // not in latin9 } } - if (!utf_iscomposing(c)) { // skip composing chars + if (!utf_iscomposing_legacy(c)) { // skip composing chars if (c < 0x100) { *d++ = (uint8_t)c; } else if (vcp->vc_fail) { diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 6cbfbcbc3c..2da051fca2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -3,6 +3,7 @@ #include <stdbool.h> #include <stdint.h> #include <sys/types.h> // IWYU pragma: keep +#include <utf8proc.h> #include <uv.h> // IWYU pragma: keep #include "nvim/cmdexpand_defs.h" // IWYU pragma: keep @@ -11,6 +12,9 @@ #include "nvim/mbyte_defs.h" // IWYU pragma: keep #include "nvim/types_defs.h" // IWYU pragma: keep +typedef utf8proc_int32_t GraphemeState; +#define GRAPHEME_STATE_INIT 0 + #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.h.generated.h" # include "mbyte.h.inline.generated.h" @@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in) static inline StrCharInfo utfc_next(StrCharInfo cur) FUNC_ATTR_NONNULL_ALL 
FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE { - int32_t prev_code = cur.chr.value; + // handle ASCII case inline uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len); - - while (true) { - if (EXPECT(*next < 0x80U, true)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = *next, .len = 1 }, - }; - } - uint8_t const next_len = utf8len_tab[*next]; - int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len); - if (!utf_char_composinglike(prev_code, next_code)) { - return (StrCharInfo){ - .ptr = (char *)next, - .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) }, - }; - } - - prev_code = next_code; - next += next_len; + if (EXPECT(*next < 0x80U, true)) { + return (StrCharInfo){ + .ptr = (char *)next, + .chr = (CharInfo){ .value = *next, .len = 1 }, + }; } + + return utfc_next_impl(cur); } static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr) diff --git a/src/nvim/message.c b/src/nvim/message.c index 53e5511a5a..79e6bc8be7 100644 --- a/src/nvim/message.c +++ b/src/nvim/message.c @@ -446,9 +446,7 @@ void trunc_string(const char *s, char *buf, int room_in, int buflen) // Last part: End of the string. half = i = (int)strlen(s); while (true) { - do { - half = half - utf_head_off(s, s + half - 1) - 1; - } while (half > 0 && utf_iscomposing(utf_ptr2char(s + half))); + half = half - utf_head_off(s, s + half - 1) - 1; n = ptr2cells(s + half); if (len + n > room || half == 0) { break; diff --git a/src/nvim/normal.c b/src/nvim/normal.c index f44a64af21..f3bdea9a85 100644 --- a/src/nvim/normal.c +++ b/src/nvim/normal.c @@ -837,7 +837,10 @@ static void normal_get_additional_char(NormalState *s) while ((s->c = vpeekc()) > 0 && (s->c >= 0x100 || MB_BYTE2LEN(vpeekc()) > 1)) { s->c = plain_vgetc(); - if (!utf_iscomposing(s->c)) { + // TODO(bfredl): only allowing up to two composing chars is cringe af. 
+ // Could reuse/abuse schar_T to at least allow us to input anything we are able + // to display and use the stateful utf8proc algorithm like utf_composinglike + if (!utf_iscomposing_legacy(s->c)) { vungetc(s->c); // it wasn't, put it back break; } else if (s->ca.ncharC1 == 0) { diff --git a/src/nvim/options.lua b/src/nvim/options.lua index 3612a80fb8..1c17b0fc9f 100644 --- a/src/nvim/options.lua +++ b/src/nvim/options.lua @@ -2326,9 +2326,12 @@ return { desc = [=[ When on all Unicode emoji characters are considered to be full width. This excludes "text emoji" characters, which are normally displayed as - single width. Unfortunately there is no good specification for this - and it has been determined on trial-and-error basis. Use the - |setcellwidths()| function to change the behavior. + single width. However, such "text emoji" are treated as full-width + emoji if they are followed by the U+FE0F variant selector. + + Unfortunately there is no good specification for this and it has been + determined on trial-and-error basis. Use the |setcellwidths()| + function to change the behavior. 
]=], full_name = 'emoji', redraw = { 'all_windows', 'ui_option' }, diff --git a/src/nvim/plines.c b/src/nvim/plines.c index e51e9bf8c3..408fe26bf3 100644 --- a/src/nvim/plines.c +++ b/src/nvim/plines.c @@ -146,7 +146,7 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco } else if (cur_char < 0) { size = kInvalidByteCells; } else { - size = char2cells(cur_char); + size = ptr2cells(cur); is_doublewidth = size == 2 && cur_char > 0x80; } @@ -337,8 +337,8 @@ CharSize charsize_regular(CharsizeArg *csarg, char *const cur, colnr_T const vco /// /// @see charsize_regular /// @see charsize_fast -static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, colnr_T const vcol, - int32_t const cur_char) +static inline CharSize charsize_fast_impl(win_T *const wp, const char *cur, bool use_tabstop, + colnr_T const vcol, int32_t const cur_char) FUNC_ATTR_PURE FUNC_ATTR_ALWAYS_INLINE { // A tab gets expanded, depending on the current column @@ -352,7 +352,11 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col if (cur_char < 0) { width = kInvalidByteCells; } else { - width = char2cells(cur_char); + // TODO(bfredl): perf: often cur_char is enough at this point to determine width. + // we likely want a specialized version of utf_ptr2StrCharInfo also determining + // the ptr2cells width at the same time without any extra decoding. (also applies + // to charsize_regular and charsize_nowrap) + width = ptr2cells(cur); } // If a double-width char doesn't fit at the end of a line, it wraps to the next line, @@ -371,23 +375,23 @@ static inline CharSize charsize_fast_impl(win_T *const wp, bool use_tabstop, col /// Can be used if CSType is kCharsizeFast. 
/// /// @see charsize_regular -CharSize charsize_fast(CharsizeArg *csarg, colnr_T const vcol, int32_t const cur_char) +CharSize charsize_fast(CharsizeArg *csarg, const char *cur, colnr_T vcol, int32_t cur_char) FUNC_ATTR_PURE { - return charsize_fast_impl(csarg->win, csarg->use_tabstop, vcol, cur_char); + return charsize_fast_impl(csarg->win, cur, csarg->use_tabstop, vcol, cur_char); } /// Get the number of cells taken up on the screen at given virtual column. /// /// @see win_chartabsize() -int charsize_nowrap(buf_T *buf, bool use_tabstop, colnr_T vcol, int32_t cur_char) +int charsize_nowrap(buf_T *buf, const char *cur, bool use_tabstop, colnr_T vcol, int32_t cur_char) { if (cur_char == TAB && use_tabstop) { return tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array); } else if (cur_char < 0) { return kInvalidByteCells; } else { - return char2cells(cur_char); + return ptr2cells(cur); } } @@ -467,7 +471,7 @@ int linesize_fast(CharsizeArg const *const csarg, int vcol_arg, colnr_T const le StrCharInfo ci = utf_ptr2StrCharInfo(line); while (ci.ptr - line < len && *ci.ptr != NUL) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol_arg, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol_arg, ci.chr.value).width; ci = utfc_next(ci); if (vcol > MAXCOL) { vcol_arg = MAXCOL; @@ -530,7 +534,7 @@ void getvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *en char_size = (CharSize){ .width = 1 }; break; } - char_size = charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value); + char_size = charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value); StrCharInfo const next = utfc_next(ci); if (next.ptr - line > end_col) { break; @@ -627,7 +631,7 @@ void getvvcol(win_T *wp, pos_T *pos, colnr_T *start, colnr_T *cursor, colnr_T *e if (pos->col < ml_get_buf_len(wp->w_buffer, pos->lnum)) { int c = utf_ptr2char(ptr + pos->col); if ((c != TAB) && vim_isprintc(c)) { - endadd = (colnr_T)(char2cells(c) - 1); + endadd = 
(colnr_T)(ptr2cells(ptr + pos->col) - 1); if (coladd > endadd) { // past end of line endadd = 0; @@ -824,7 +828,7 @@ int plines_win_col(win_T *wp, linenr_T lnum, long column) if (cstype == kCharsizeFast) { bool const use_tabstop = csarg.use_tabstop; while (*ci.ptr != NUL && --column >= 0) { - vcol += charsize_fast_impl(wp, use_tabstop, vcol, ci.chr.value).width; + vcol += charsize_fast_impl(wp, ci.ptr, use_tabstop, vcol, ci.chr.value).width; ci = utfc_next(ci); } } else { diff --git a/src/nvim/plines.h b/src/nvim/plines.h index 7128e37237..50310b8ce1 100644 --- a/src/nvim/plines.h +++ b/src/nvim/plines.h @@ -54,7 +54,7 @@ static inline CharSize win_charsize(CSType cstype, int vcol, char *ptr, int32_t FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE { if (cstype == kCharsizeFast) { - return charsize_fast(csarg, vcol, chr); + return charsize_fast(csarg, ptr, vcol, chr); } else { return charsize_regular(csarg, ptr, vcol, chr); } diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 7dbbb19545..c91c112c3c 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c) { return utf_char2len(c) > 1 && (re_multi_type(peekchr()) != NOT_MULTI - || utf_iscomposing(c)); + || utf_iscomposing_legacy(c)); } // Emit (if appropriate) a byte of code @@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. - if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { c = getchr(); goto do_multibyte; } @@ -5001,9 +5001,10 @@ do_multibyte: int l; // Need to get composing character too. 
+ GraphemeState state = GRAPHEME_STATE_INIT; while (true) { l = utf_ptr2len(regparse); - if (!utf_composinglike(regparse, regparse + l)) { + if (!utf_composinglike(regparse, regparse + l, &state)) { break; } regmbc(utf_ptr2char(regparse)); @@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) // Check for following composing character, unless %C // follows (skips over all composing chars). if (status != RA_NOMATCH - && utf_composinglike((char *)rex.input, (char *)rex.input + len) + && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL) && !rex.reg_icombine && OP(next) != RE_COMPOSING) { // raaron: This code makes a composing character get @@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) break; } const int opndc = utf_ptr2char((char *)opnd); - if (utf_iscomposing(opndc)) { + if (utf_iscomposing_legacy(opndc)) { // When only a composing char is given match at any // position where that composing char appears. status = RA_NOMATCH; for (i = 0; rex.input[i] != NUL; i += utf_ptr2len((char *)rex.input + i)) { const int inpc = utf_ptr2char((char *)rex.input + i); - if (!utf_iscomposing(inpc)) { + if (!utf_iscomposing_legacy(inpc)) { if (i > 0) { break; } @@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) case RE_COMPOSING: // Skip composing characters. - while (utf_iscomposing(utf_ptr2char((char *)rex.input))) { + while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) { rex.input += utf_ptr2len((char *)rex.input); } break; @@ -10070,7 +10071,7 @@ static int nfa_regatom(void) } // When '.' is followed by a composing char ignore the dot, so that // the composing char is matched here. 
- if (c == Magic('.') && utf_iscomposing(peekchr())) { + if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) { old_regparse = (uint8_t *)regparse; c = getchr(); goto nfa_do_multibyte; @@ -10705,7 +10706,7 @@ collection: nfa_do_multibyte: // plen is length of current char with composing chars if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse)) - || utf_iscomposing(c)) { + || utf_iscomposing_legacy(c)) { int i = 0; // A base character plus composing characters, or just one @@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) } if (match // check that no composing char follows - && !utf_iscomposing(utf_ptr2char((char *)s2))) { + && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) { cleanup_subexpr(); if (REG_MULTI) { rex.reg_startpos[0].lnum = rex.lnum; @@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm // is not really a match. if (!rex.reg_icombine && rex.input != rex.line - && utf_iscomposing(curc)) { + && utf_iscomposing_legacy(curc)) { break; } nfa_match = true; @@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm sta = t->state->out; len = 0; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm int j; sta = t->state->out->out; - if (utf_iscomposing(sta->c)) { + if (utf_iscomposing_legacy(sta->c)) { // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). @@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm case NFA_ANY_COMPOSING: // On a composing character skip over it. Otherwise do // nothing. 
Always matches. - if (utf_iscomposing(curc)) { + if (utf_iscomposing_legacy(curc)) { add_off = clen; } else { add_here = true; diff --git a/src/nvim/search.c b/src/nvim/search.c index 9e00664d86..ff6e135df1 100644 --- a/src/nvim/search.c +++ b/src/nvim/search.c @@ -1260,7 +1260,7 @@ int do_search(oparg_T *oap, int dirc, int search_delim, char *pat, size_t patlen // empty for the search_stat feature. if (!cmd_silent) { msgbuf[0] = (char)dirc; - if (utf_iscomposing(utf_ptr2char(p))) { + if (utf_iscomposing_first(utf_ptr2char(p))) { // Use a space to draw the composing char on. msgbuf[1] = ' '; memmove(msgbuf + 2, p, plen); diff --git a/src/nvim/sign.c b/src/nvim/sign.c index 9b2516ed83..b4ba7833e9 100644 --- a/src/nvim/sign.c +++ b/src/nvim/sign.c @@ -376,7 +376,7 @@ int init_sign_text(sign_T *sp, schar_T *sign_text, char *text) if (!vim_isprintc(c)) { break; } - int width = utf_char2cells(c); + int width = utf_ptr2cells(s); if (width == 2) { sign_text[cells + 1] = 0; } diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c index d6053a533e..b37f01e769 100644 --- a/src/nvim/spellsuggest.c +++ b/src/nvim/spellsuggest.c @@ -1792,10 +1792,8 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun // For changing a composing character adjust // the score from SCORE_SUBST to // SCORE_SUBCOMP. 
- if (utf_iscomposing(utf_ptr2char(tword + sp->ts_twordlen - - sp->ts_tcharlen)) - && utf_iscomposing(utf_ptr2char(fword - + sp->ts_fcharstart))) { + if (utf_iscomposing_legacy(utf_ptr2char(tword + sp->ts_twordlen - sp->ts_tcharlen)) + && utf_iscomposing_legacy(utf_ptr2char(fword + sp->ts_fcharstart))) { sp->ts_score -= SCORE_SUBST - SCORE_SUBCOMP; } else if (!soundfold && slang->sl_has_map @@ -1811,7 +1809,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun && sp->ts_twordlen > sp->ts_tcharlen) { p = tword + sp->ts_twordlen - sp->ts_tcharlen; c = utf_ptr2char(p); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { // Inserting a composing char doesn't // count that much. sp->ts_score -= SCORE_INS - SCORE_INSCOMP; @@ -1876,7 +1874,7 @@ static void suggest_trie_walk(suginfo_T *su, langp_T *lp, char *fword, bool soun c = utf_ptr2char(fword + sp->ts_fidx); stack[depth].ts_fidx = (uint8_t)(stack[depth].ts_fidx + utfc_ptr2len(fword + sp->ts_fidx)); - if (utf_iscomposing(c)) { + if (utf_iscomposing_legacy(c)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELCOMP; } else if (c == utf_ptr2char(fword + stack[depth].ts_fidx)) { stack[depth].ts_score -= SCORE_DEL - SCORE_DELDUP; diff --git a/src/nvim/textformat.c b/src/nvim/textformat.c index 96907362dd..30c7d0ee92 100644 --- a/src/nvim/textformat.c +++ b/src/nvim/textformat.c @@ -47,7 +47,7 @@ static bool did_add_space = false; ///< auto_format() added an extra space ///< under the cursor #define WHITECHAR(cc) (ascii_iswhite(cc) \ - && !utf_iscomposing(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) + && !utf_iscomposing_first(utf_ptr2char((char *)get_cursor_pos_ptr() + 1))) /// Return true if format option 'x' is in effect. /// Take care of no formatting when 'paste' is set. 
diff --git a/src/nvim/tui/tui.c b/src/nvim/tui/tui.c index 1866a4a592..7e1068ed56 100644 --- a/src/nvim/tui/tui.c +++ b/src/nvim/tui/tui.c @@ -109,6 +109,7 @@ struct TUIData { bool set_cursor_color_as_str; bool cursor_color_changed; bool is_starting; + bool did_set_grapheme_cluster_mode; FILE *screenshot; cursorentry_T cursor_shapes[SHAPE_IDX_COUNT]; HlAttrs clear_attrs; @@ -220,6 +221,7 @@ static void tui_set_term_mode(TUIData *tui, TermMode mode, bool set) void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) FUNC_ATTR_NONNULL_ALL { + bool is_set = false; switch (state) { case kTermModeNotRecognized: case kTermModePermanentlySet: @@ -228,6 +230,8 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) // then there is nothing to do break; case kTermModeSet: + is_set = true; + FALLTHROUGH; case kTermModeReset: // The terminal supports changing the given mode switch (mode) { @@ -240,6 +244,12 @@ void tui_handle_term_mode(TUIData *tui, TermMode mode, TermModeState state) signal_watcher_stop(&tui->winch_handle); tui_set_term_mode(tui, mode, true); break; + case kTermModeGraphemeClusters: + if (!is_set) { + tui_set_term_mode(tui, mode, true); + tui->did_set_grapheme_cluster_mode = true; + } + break; } } } @@ -434,6 +444,7 @@ static void terminfo_start(TUIData *tui) if (!nsterm) { tui_request_term_mode(tui, kTermModeSynchronizedOutput); tui_request_term_mode(tui, kTermModeResizeEvents); + tui_request_term_mode(tui, kTermModeGraphemeClusters); } // Don't use DECRQSS in screen or tmux, as they behave strangely when receiving it. @@ -494,7 +505,9 @@ static void terminfo_stop(TUIData *tui) // Disable resize events tui_set_term_mode(tui, kTermModeResizeEvents, false); - + if (tui->did_set_grapheme_cluster_mode) { + tui_set_term_mode(tui, kTermModeGraphemeClusters, false); + } // May restore old title before exiting alternate screen. 
tui_set_title(tui, NULL_STRING); if (ui_client_exit_status == 0) { @@ -1010,7 +1023,7 @@ static void print_cell_at_pos(TUIData *tui, int row, int col, UCell *cell, bool char buf[MAX_SCHAR_SIZE]; schar_get(buf, cell->data); int c = utf_ptr2char(buf); - bool is_ambiwidth = utf_ambiguous_width(c); + bool is_ambiwidth = utf_ambiguous_width(buf); if (is_doublewidth && (is_ambiwidth || utf_char2cells(c) == 1)) { // If the server used setcellwidths() to treat a single-width char as double-width, // it needs to be treated like an ambiguous-width char. diff --git a/src/nvim/tui/tui_defs.h b/src/nvim/tui/tui_defs.h index 46913e07a2..bd99d6b0ad 100644 --- a/src/nvim/tui/tui_defs.h +++ b/src/nvim/tui/tui_defs.h @@ -4,6 +4,7 @@ typedef struct TUIData TUIData; typedef enum { kTermModeSynchronizedOutput = 2026, + kTermModeGraphemeClusters = 2027, kTermModeResizeEvents = 2048, } TermMode; |