diff options
author | bfredl <bjorn.linse@gmail.com> | 2023-11-06 14:52:27 +0100 |
---|---|---|
committer | bfredl <bjorn.linse@gmail.com> | 2023-11-17 12:58:57 +0100 |
commit | b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254 (patch) | |
tree | 434ec27e069ba57406ce9f6d194627e95c3d315c | |
parent | 20ec4c776a07492c2e3b995e10b40b1cdb52bc7a (diff) | |
download | rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.gz rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.tar.bz2 rneovim-b522cb1ac3fbdf6e68eed5d0b6e1cbeaf3ac2254.zip |
refactor(grid): make screen rendering more multibyte than ever before
Problem: buffer text with composing chars is converted from UTF-8
to an array of up to seven UTF-32 values and then converted back
to UTF-8 strings.
Solution: Convert buffer text directly to UTF-8 based schar_T values.
The limit of the text size is now in schar_T bytes, which is currently
31+1 but could easily be raised, as it no longer multiplies the size
of the entire screen grid when not used; the full size is only required
for temporary scratch buffers.
Also does some general cleanup to win_line text handling, which was
unnecessarily complicated due to multibyte rendering being an "opt-in"
feature long ago. Nowadays, a char is just a char, regardless of whether it
consists of one ASCII byte or multiple bytes.
-rw-r--r-- | runtime/doc/mbyte.txt | 3 | ||||
-rw-r--r-- | runtime/doc/news.txt | 7 | ||||
-rw-r--r-- | runtime/doc/vim_diff.txt | 13 | ||||
-rw-r--r-- | runtime/lua/vim/_meta/options.lua | 2 | ||||
-rw-r--r-- | src/nvim/change.c | 11 | ||||
-rw-r--r-- | src/nvim/charset.c | 19 | ||||
-rw-r--r-- | src/nvim/digraph.c | 2 | ||||
-rw-r--r-- | src/nvim/drawline.c | 261 | ||||
-rw-r--r-- | src/nvim/edit.c | 17 | ||||
-rw-r--r-- | src/nvim/eval.c | 2 | ||||
-rw-r--r-- | src/nvim/eval/funcs.c | 33 | ||||
-rw-r--r-- | src/nvim/ex_cmds.c | 110 | ||||
-rw-r--r-- | src/nvim/grid.c | 65 | ||||
-rw-r--r-- | src/nvim/grid_defs.h | 6 | ||||
-rw-r--r-- | src/nvim/insexpand.c | 2 | ||||
-rw-r--r-- | src/nvim/lua/stdlib.c | 2 | ||||
-rw-r--r-- | src/nvim/match.c | 2 | ||||
-rw-r--r-- | src/nvim/mbyte.c | 123 | ||||
-rw-r--r-- | src/nvim/mbyte.h | 1 | ||||
-rw-r--r-- | src/nvim/message.c | 10 | ||||
-rw-r--r-- | src/nvim/option_vars.h | 1 | ||||
-rw-r--r-- | src/nvim/spellsuggest.c | 2 | ||||
-rw-r--r-- | test/functional/ui/fold_spec.lua | 38 | ||||
-rw-r--r-- | test/functional/ui/multibyte_spec.lua | 30 | ||||
-rw-r--r-- | test/functional/ui/output_spec.lua | 4 | ||||
-rw-r--r-- | test/unit/mbyte_spec.lua | 243 |
26 files changed, 403 insertions, 606 deletions
diff --git a/runtime/doc/mbyte.txt b/runtime/doc/mbyte.txt index aedef87a09..0a7e0baad3 100644 --- a/runtime/doc/mbyte.txt +++ b/runtime/doc/mbyte.txt @@ -646,7 +646,8 @@ widespread as file format. A composing or combining character is used to change the meaning of the character before it. The combining characters are drawn on top of the preceding character. -Up to six combining characters can be displayed. +Too big combined characters cannot be displayed, but they can still be +inspected using the |g8| and |ga| commands described below. When editing text a composing character is mostly considered part of the preceding character. For example "x" will delete a character and its following composing characters by default. diff --git a/runtime/doc/news.txt b/runtime/doc/news.txt index 2f48ebfeff..cb3220a630 100644 --- a/runtime/doc/news.txt +++ b/runtime/doc/news.txt @@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior. Note that syntax highlighting of code examples requires a matching parser and may be affected by custom queries. +• Support for rendering multibyte characters using composing characters has been + enhanced. The maximum limit has been increased from 1+6 codepoints to + 31 bytes, which is guaranteed to fit all chars from before but often more. + + NOTE: the regexp engine still has a hard-coded limit of considering + 6 composing chars only. + ============================================================================== REMOVED FEATURES *news-removed* diff --git a/runtime/doc/vim_diff.txt b/runtime/doc/vim_diff.txt index 05d7e5feb9..5e09cc2481 100644 --- a/runtime/doc/vim_diff.txt +++ b/runtime/doc/vim_diff.txt @@ -722,9 +722,16 @@ Options: < *'macatsui'* *'maxcombine'* *'mco'* - Nvim always displays up to 6 combining characters. You can still edit - text with more than 6 combining characters, you just can't see them. - Use |g8| or |ga|. See |mbyte-combining|. 
+ Nvim counts maximum character sizes in bytes, not codepoints. This is + guaranteed to be big enough to always fit all chars properly displayed + in vim with 'maxcombine' set to 6. + + You can still edit text with characters larger than fit in the screen buffer, + you just can't see them. Use |g8| or |ga|. See |mbyte-combining|. + + NOTE: the regexp engine still has a hard-coded limit of considering + 6 composing chars only. + *'maxmem'* Nvim delegates memory-management to the OS. *'maxmemtot'* Nvim delegates memory-management to the OS. printoptions diff --git a/runtime/lua/vim/_meta/options.lua b/runtime/lua/vim/_meta/options.lua index 19ae786177..be4a4dd49c 100644 --- a/runtime/lua/vim/_meta/options.lua +++ b/runtime/lua/vim/_meta/options.lua @@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg --- security reasons. --- --- @type boolean -vim.o.fsync = false +vim.o.fsync = true vim.o.fs = vim.o.fsync vim.go.fsync = vim.o.fsync vim.go.fs = vim.go.fsync diff --git a/src/nvim/change.c b/src/nvim/change.c index 58718811bc..aa58779f5b 100644 --- a/src/nvim/change.c +++ b/src/nvim/change.c @@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len) /// convert bytes to a character. void ins_char(int c) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; size_t n = (size_t)utf_char2bytes(c, buf); // When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte. @@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) // If 'delcombine' is set and deleting (less than) one character, only // delete the last combining character. - if (p_deco && use_delcombine - && utfc_ptr2len(oldp + col) >= count) { - int cc[MAX_MCO]; - - (void)utfc_ptr2char(oldp + col, cc); - if (cc[0] != NUL) { + if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) { + char *p0 = oldp + col; + if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) { // Find the last composing char, there can be several. 
int n = col; do { diff --git a/src/nvim/charset.c b/src/nvim/charset.c index 0adcc09ec7..5dfc9c444d 100644 --- a/src/nvim/charset.c +++ b/src/nvim/charset.c @@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab) while (*p) { const size_t l = (size_t)utfc_ptr2len(p); if (l > 1) { - int pcc[MAX_MCO + 1]; - pcc[0] = utfc_ptr2char(p, &pcc[1]); - - if (vim_isprintc(pcc[0])) { + if (vim_isprintc(utf_ptr2char(p))) { len += l; } else { - for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { + for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) { + int c = utf_ptr2char(p + off); char hexbuf[9]; - len += transchar_hex(hexbuf, pcc[i]); + len += transchar_hex(hexbuf, c); } } p += l; @@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co if (buf_p + l > buf_e) { break; // Exceeded `buf` size. } - int pcc[MAX_MCO + 1]; - pcc[0] = utfc_ptr2char(p, &pcc[1]); - if (vim_isprintc(pcc[0])) { + if (vim_isprintc(utf_ptr2char(p))) { memmove(buf_p, p, l); buf_p += l; } else { - for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { + for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) { + int c = utf_ptr2char(p + off); char hexbuf[9]; // <up to 6 bytes>NUL - const size_t hexlen = transchar_hex(hexbuf, pcc[i]); + const size_t hexlen = transchar_hex(hexbuf, c); if (buf_p + hexlen > buf_e) { break; } diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c index bc0ce99c5e..1bff78f90a 100644 --- a/src/nvim/digraph.c +++ b/src/nvim/digraph.c @@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n) bool check_digraph_chars_valid(int char1, int char2) { if (char2 == 0) { - char msg[MB_MAXBYTES + 1]; + char msg[MB_MAXCHAR + 1]; msg[utf_char2bytes(char1, msg)] = NUL; semsg(_(e_digraph_must_be_just_two_characters_str), msg); return false; diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c index 11b4e55c5c..0cfab5cec9 100644 --- a/src/nvim/drawline.c +++ b/src/nvim/drawline.c 
@@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells const char *p = *pp; int cells = utf_ptr2cells(p); int c_len = utfc_ptr2len(p); - int u8c, u8cc[MAX_MCO]; assert(maxcells > 0); if (cells > maxcells) { dest[0] = schar_from_ascii(' '); return 1; } - u8c = utfc_ptr2char(p, u8cc); if (*p == TAB) { cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells); } @@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells for (int c = 0; c < cells; c++) { dest[c] = schar_from_ascii(' '); } - goto done; - } else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) { - dest[0] = schar_from_ascii(*p); } else { - dest[0] = schar_from_cc(u8c, u8cc); - } - if (cells > 1) { - dest[1] = 0; + int u8c; + dest[0] = utfc_ptr2schar(p, &u8c); + if (cells > 1) { + dest[1] = 0; + } } -done: + *pp += c_len; return cells; } @@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t } } -static bool check_mb_utf8(int *c, int *u8cc) -{ - if (utf_char2len(*c) > 1) { - *u8cc = 0; - *c = 0xc0; - return true; - } - return false; -} - static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line) { colnr_T trailcol = MAXCOL; @@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl { winlinevars_T wlv; // variables passed between functions - int c = 0; // init for GCC colnr_T vcol_prev = -1; // "wlv.vcol" of previous character char *line; // current line char *ptr; // current position in "line" @@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl int multi_attr = 0; // attributes desired by multibyte int mb_l = 1; // multi-byte byte length int mb_c = 0; // decoded multi-byte character - bool mb_utf8 = false; // screen char is UTF-8 char - int u8cc[MAX_MCO]; // composing UTF-8 chars + schar_T mb_schar; // complete screen char int change_start = 
MAXCOL; // first col of changed area int change_end = -1; // last col of changed area bool in_multispace = false; // in multiple consecutive spaces @@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // For the '$' of the 'list' option, n_extra == 1, p_extra == "". if (wlv.n_extra > 0) { if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) { - c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra; - mb_c = c; // doesn't handle non-utf-8 multi-byte! - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra; + mb_schar = schar_from_char(mb_c); + wlv.n_extra--; } else { assert(wlv.p_extra != NULL); - c = (uint8_t)(*wlv.p_extra); - mb_c = c; - // If the UTF-8 character is more than one byte: - // Decode it into "mb_c". mb_l = utfc_ptr2len(wlv.p_extra); - mb_utf8 = false; - if (mb_l > wlv.n_extra) { - mb_l = 1; - } else if (mb_l > 1) { - mb_c = utfc_ptr2char(wlv.p_extra, u8cc); - mb_utf8 = true; - c = 0xc0; - } - if (mb_l == 0) { // at the NUL at end-of-line + mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c); + // mb_l=0 at the end-of-line NUL + if (mb_l > wlv.n_extra || mb_l == 0) { mb_l = 1; } // If a double-width char doesn't fit display a '>' in the last column. + // Don't advance the pointer but put the character at the start of the next line. if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { - c = '>'; - mb_c = c; + mb_c = '>'; mb_l = 1; (void)mb_l; + mb_schar = schar_from_ascii(mb_c); multi_attr = win_hl_attr(wp, HLF_AT); if (wlv.cul_attr) { @@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl ? hl_combine_attr(wlv.cul_attr, multi_attr) : hl_combine_attr(multi_attr, wlv.cul_attr); } - - // put the pointer back to output the double-width - // character at the start of the next line. 
- wlv.n_extra++; - wlv.p_extra--; } else { - wlv.n_extra -= mb_l - 1; - wlv.p_extra += mb_l - 1; + wlv.n_extra -= mb_l; + wlv.p_extra += mb_l; } - wlv.p_extra++; } - wlv.n_extra--; // Only restore search_attr and area_attr after "n_extra" in // the next screen line is also done. @@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } } else if (has_fold) { // skip writing the buffer line itself - c = NUL; + mb_c = NUL; } else { - int c0; char *prev_ptr = ptr; - // Get a character from the line itself. - c0 = c = (uint8_t)(*ptr); - mb_c = c; - - if (c == NUL) { + // first byte of next char + int c0 = (uint8_t)(*ptr); + if (c0 == NUL) { // no more cells to skip wlv.skip_cells = 0; } - // If the UTF-8 character is more than one byte: Decode it - // into "mb_c". + // Get a character from the line itself. mb_l = utfc_ptr2len(ptr); - mb_utf8 = false; - if (mb_l > 1) { - mb_c = utfc_ptr2char(ptr, u8cc); - // Overlong encoded ASCII or ASCII with composing char - // is displayed normally, except a NUL. - if (mb_c < 0x80) { - c0 = c = mb_c; - } - mb_utf8 = true; - - // At start of the line we can have a composing char. - // Draw it as a space with a composing char. - if (utf_iscomposing(mb_c)) { - for (int i = MAX_MCO - 1; i > 0; i--) { - u8cc[i] = u8cc[i - 1]; - } - u8cc[0] = mb_c; - mb_c = ' '; - } + mb_schar = utfc_ptr2schar(ptr, &mb_c); + + // Overlong encoded ASCII or ASCII with composing char + // is displayed normally, except a NUL. + if (mb_l > 1 && mb_c < 0x80) { + c0 = mb_c; } - if ((mb_l == 1 && c >= 0x80) + if ((mb_l == 1 && c0 >= 0x80) || (mb_l >= 1 && mb_c == 0) || (mb_l > 1 && (!vim_isprintc(mb_c)))) { // Illegal UTF-8 byte: display as <xx>. - // Non-BMP character : display as ? or fullwidth ?. + // Non-printable character : display as ? or fullwidth ?. 
transchar_hex(wlv.extra, mb_c); if (wp->w_p_rl) { // reverse rl_mirror_ascii(wlv.extra, NULL); } wlv.p_extra = wlv.extra; - c = (uint8_t)(*wlv.p_extra); mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra); - mb_utf8 = (c >= 0x80); + mb_schar = schar_from_char(mb_c); wlv.n_extra = (int)strlen(wlv.p_extra); wlv.c_extra = NUL; wlv.c_final = NUL; @@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // last column; the character is displayed at the start of the // next line. if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { - c = '>'; - mb_c = c; - mb_utf8 = false; + mb_c = '>'; mb_l = 1; + mb_schar = schar_from_ascii(mb_c); multi_attr = win_hl_attr(wp, HLF_AT); // Put pointer back so that the character will be // displayed at the start of the next line. @@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = 1; wlv.c_extra = MB_FILLER_CHAR; wlv.c_final = NUL; - c = ' '; + mb_c = ' '; + mb_l = 1; + mb_schar = schar_from_ascii(mb_c); if (area_attr == 0 && search_attr == 0) { wlv.n_attr = wlv.n_extra + 1; wlv.extra_attr = win_hl_attr(wp, HLF_AT); saved_attr2 = wlv.char_attr; // save current attr } - mb_c = c; - mb_utf8 = false; - mb_l = 1; } ptr++; @@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // no concealing past the end of the line, it interferes // with line highlighting. - if (c == NUL) { - syntax_flags = 0; - } else { - syntax_flags = get_syntax_info(&syntax_seqnr); - } + syntax_flags = (mb_c == 0) ? 
0 : get_syntax_info(&syntax_seqnr); } if (has_decor && v > 0) { @@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl spell_attr = 0; // do not calculate cap_col at the end of the line or when // only white space is following - if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) { + if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) { char *p; hlf_T spell_hlf = HLF_COUNT; v -= mb_l - 1; @@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // // So only allow to linebreak, once we have found chars not in // 'breakat' in the line. - if (wp->w_p_lbr && !wlv.need_lbr && c != NUL + if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL && !vim_isbreak((uint8_t)(*ptr))) { wlv.need_lbr = true; } // Found last space before word: check for line break. - if (wp->w_p_lbr && c0 == c && wlv.need_lbr - && vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) { + if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr + && vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) { int mb_off = utf_head_off(line, ptr - 1); char *p = ptr - (mb_off + 1); chartabsize_T cts; @@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1; clear_chartabsize_arg(&cts); - if (on_last_col && c != TAB) { + if (on_last_col && mb_c != TAB) { // Do not continue search/match highlighting over the // line break, but for TABs the highlighting should // include the complete width of the character search_attr = 0; } - if (c == TAB && wlv.n_extra + wlv.col > grid->cols) { + if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) { wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts, wp->w_buffer->b_p_vts_array) - 1; } wlv.c_extra = mb_off > 0 ? 
MB_FILLER_CHAR : ' '; wlv.c_final = NUL; - if (ascii_iswhite(c)) { - if (c == TAB) { + if (mb_c < 128 && ascii_iswhite(mb_c)) { + if (mb_c == TAB) { // See "Tab alignment" below. FIX_FOR_BOGUSCOLS; } if (!wp->w_p_list) { - c = ' '; + mb_c = ' '; + mb_schar = schar_from_ascii(mb_c); } } } if (wp->w_p_list) { - in_multispace = c == ' ' && (*ptr == ' ' - || (prev_ptr > line && prev_ptr[-1] == ' ')); + in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' ')); if (!in_multispace) { multispace_pos = 0; } @@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // But not when the character is followed by a composing // character (use mb_l to check that). if (wp->w_p_list - && ((((c == 160 && mb_l == 1) - || (mb_utf8 - && ((mb_c == 160 && mb_l == 2) - || (mb_c == 0x202f && mb_l == 3)))) + && ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3)) && wp->w_p_lcs_chars.nbsp) - || (c == ' ' + || (mb_c == ' ' && mb_l == 1 && (wp->w_p_lcs_chars.space || (in_multispace && wp->w_p_lcs_chars.multispace != NULL)) && ptr - line >= leadcol && ptr - line <= trailcol))) { if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) { - c = wp->w_p_lcs_chars.multispace[multispace_pos++]; + mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++]; if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) { multispace_pos = 0; } } else { - c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp; + mb_c = (mb_c == ' ') ? 
wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp; } wlv.n_attr = 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } - if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol) - || (leadcol != 0 && ptr < line + leadcol))) { + if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol) + || (leadcol != 0 && ptr < line + leadcol))) { if (leadcol != 0 && in_multispace && ptr < line + leadcol && wp->w_p_lcs_chars.leadmultispace != NULL) { - c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++]; + mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++]; if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) { multispace_pos = 0; } } else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) { - c = wp->w_p_lcs_chars.trail; + mb_c = wp->w_p_lcs_chars.trail; } else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) { - c = wp->w_p_lcs_chars.lead; + mb_c = wp->w_p_lcs_chars.lead; } else if (leadcol != 0 && wp->w_p_lcs_chars.space) { - c = wp->w_p_lcs_chars.space; + mb_c = wp->w_p_lcs_chars.space; } wlv.n_attr = 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } } // Handling of non-printable characters. - if (!vim_isprintc(c)) { + if (!vim_isprintc(mb_c)) { // when getting a character from the file, we may have to // turn it into something else on the way to putting it on the screen. 
- if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) { + if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) { int tab_len = 0; colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length char *const sbr = get_showbreak_value(wp); @@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl if (wlv.n_extra > 0) { len += wlv.n_extra - tab_len; } - c = wp->w_p_lcs_chars.tab1; + mb_c = wp->w_p_lcs_chars.tab1; p = get_extra_buf((size_t)len + 1); memset(p, ' ', (size_t)len); p[len] = NUL; @@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } } - mb_utf8 = false; // don't draw as UTF-8 if (wp->w_p_list) { - c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3) - ? wp->w_p_lcs_chars.tab3 - : wp->w_p_lcs_chars.tab1; + mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3) + ? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1; if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) { wlv.c_extra = NUL; // using p_extra from above } else { @@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_attr = tab_len + 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); } else { wlv.c_final = NUL; wlv.c_extra = ' '; - c = ' '; + mb_c = ' '; } - } else if (c == NUL + mb_schar = schar_from_char(mb_c); + } else if (mb_c == NUL && (wp->w_p_list || ((wlv.fromcol >= 0 || fromcol_prev >= 0) && wlv.tocol > wlv.vcol @@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = 0; } if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) { - c = wp->w_p_lcs_chars.eol; + mb_c = wp->w_p_lcs_chars.eol; } else { - c = ' '; + mb_c = ' '; } lcs_eol_one = -1; ptr--; // put it back at the NUL wlv.extra_attr = win_hl_attr(wp, HLF_AT); wlv.n_attr = 1; - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); - } 
else if (c != NUL) { - wlv.p_extra = transchar_buf(wp->w_buffer, c); + mb_schar = schar_from_char(mb_c); + } else if (mb_c != NUL) { + wlv.p_extra = transchar_buf(wp->w_buffer, mb_c); if (wlv.n_extra == 0) { - wlv.n_extra = byte2cells(c) - 1; + wlv.n_extra = byte2cells(mb_c) - 1; } if ((dy_flags & DY_UHEX) && wp->w_p_rl) { rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>" @@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl if (wp->w_p_lbr) { char *p; - c = (uint8_t)(*wlv.p_extra); + mb_c = (uint8_t)(*wlv.p_extra); p = get_extra_buf((size_t)wlv.n_extra + 1); memset(p, ' ', (size_t)wlv.n_extra); strncpy(p, // NOLINT(runtime/printf) @@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl p[wlv.n_extra] = NUL; wlv.p_extra = p; } else { - wlv.n_extra = byte2cells(c) - 1; - c = (uint8_t)(*wlv.p_extra++); + wlv.n_extra = byte2cells(mb_c) - 1; + mb_c = (uint8_t)(*wlv.p_extra++); } wlv.n_attr = wlv.n_extra + 1; wlv.extra_attr = win_hl_attr(wp, HLF_8); saved_attr2 = wlv.char_attr; // save current attr - mb_utf8 = false; // don't draw as UTF-8 + mb_schar = schar_from_ascii(mb_c); } else if (VIsual_active && (VIsual_mode == Ctrl_V || VIsual_mode == 'v') && virtual_active() && wlv.tocol != MAXCOL && wlv.vcol < wlv.tocol && wlv.col < grid->cols) { - c = ' '; + mb_c = ' '; + mb_schar = schar_from_char(mb_c); ptr--; // put it back at the NUL } } @@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // First time at this concealed item: display one // character. 
if (has_match_conc && match_conc) { - c = match_conc; + mb_c = match_conc; } else if (decor_conceal && decor_state.conceal_char) { - c = decor_state.conceal_char; + mb_c = decor_state.conceal_char; if (decor_state.conceal_attr) { wlv.char_attr = decor_state.conceal_attr; } } else if (syn_get_sub_char() != NUL) { - c = syn_get_sub_char(); + mb_c = syn_get_sub_char(); } else if (wp->w_p_lcs_chars.conceal != NUL) { - c = wp->w_p_lcs_chars.conceal; + mb_c = wp->w_p_lcs_chars.conceal; } else { - c = ' '; + mb_c = ' '; } prev_syntax_id = syntax_seqnr; @@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl is_concealing = true; wlv.skip_cells = 1; } - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } else { prev_syntax_id = 0; is_concealing = false; @@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl && (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0) && wlv.filler_todo <= 0 && wlv.draw_state > WL_STC - && c != NUL) { - c = wp->w_p_lcs_chars.prec; + && mb_c != NUL) { + mb_c = wp->w_p_lcs_chars.prec; lcs_prec_todo = NUL; if (utf_char2cells(mb_c) > 1) { // Double-width character being overwritten by the "precedes" @@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_attr = 2; wlv.extra_attr = win_hl_attr(wp, HLF_AT); } - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); saved_attr3 = wlv.char_attr; // save current attr wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr n_attr3 = 1; } // At end of the text line or just after the last character. 
- if (c == NUL && eol_hl_off == 0) { + if (mb_c == NUL && eol_hl_off == 0) { // flag to indicate whether prevcol equals startcol of search_hl or // one of the matches bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl, @@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } // At end of the text line. - if (c == NUL) { + if (mb_c == NUL) { // Highlight 'cursorcolumn' & 'colorcolumn' past end of the line. if (wp->w_p_wrap) { v = wlv.startrow == 0 ? wp->w_skipcol : 0; @@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl || lcs_eol_one > 0 || (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL)) || has_more_inline_virt(&wlv, v)) { - c = wp->w_p_lcs_chars.ext; + mb_c = wp->w_p_lcs_chars.ext; wlv.char_attr = win_hl_attr(wp, HLF_AT); - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } } @@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // Skip characters that are left of the screen for 'nowrap'. if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) { // Store the character. 
- if (mb_utf8) { - linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc); - } else { - linebuf_char[wlv.off] = schar_from_ascii((char)c); - } + linebuf_char[wlv.off] = mb_schar; if (multi_attr) { linebuf_attr[wlv.off] = multi_attr; multi_attr = 0; diff --git a/src/nvim/edit.c b/src/nvim/edit.c index ce547b55fe..eb5ea2c873 100644 --- a/src/nvim/edit.c +++ b/src/nvim/edit.c @@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight) pc_status = PC_STATUS_SET; } - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr); grid_line_flush(); } @@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent) int cc; if ((cc = utf_char2len(c)) > 1) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; utf_char2bytes(c, buf); buf[cc] = NUL; @@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) int cc; int temp = 0; // init for GCC bool did_backspace = false; - int cpc[MAX_MCO]; // composing characters bool call_fix_indent = false; // can't delete anything in an empty file @@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) if (State & REPLACE_FLAG) { replace_do_bs(-1); } else { - const int l_p_deco = p_deco; - if (l_p_deco) { - (void)utfc_ptr2char(get_cursor_pos_ptr(), cpc); + bool has_composing = false; + if (p_deco) { + char *p0 = get_cursor_pos_ptr(); + has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0)); } (void)del_char(false); // If there are combining characters and 'delcombine' is set - // move the cursor back. Don't back up before the base - // character. - if (l_p_deco && cpc[0] != NUL) { + // move the cursor back. Don't back up before the base character. 
+ if (has_composing) { inc_cursor(); } if (revins_chars) { diff --git a/src/nvim/eval.c b/src/nvim/eval.c index ed70091077..c073f30547 100644 --- a/src/nvim/eval.c +++ b/src/nvim/eval.c @@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE /// Set v:char to character "c". void set_vim_var_char(int c) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; buf[utf_char2bytes(c, buf)] = NUL; set_vim_var_string(VV_CHAR, buf, -1); diff --git a/src/nvim/eval/funcs.c b/src/nvim/eval/funcs.c index c6909245af..8ef208f291 100644 --- a/src/nvim/eval/funcs.c +++ b/src/nvim/eval/funcs.c @@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) return; } - char buf[MB_MAXBYTES]; + char buf[MB_MAXCHAR]; const int len = utf_char2bytes((int)num, buf); rettv->v_type = VAR_STRING; @@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { c = -1; } else { - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); c = utf_ptr2char(buf); } @@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) ScreenGrid *grid; screenchar_adjust(&grid, &row, &col); + tv_list_alloc_ret(rettv, kListLenMayKnow); if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { - tv_list_alloc_ret(rettv, 0); return; } - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); - int pcc[MAX_MCO]; - int c = utfc_ptr2char(buf, pcc); - int composing_len = 0; - while (composing_len < MAX_MCO && pcc[composing_len] != 0) { - composing_len++; - } - tv_list_alloc_ret(rettv, composing_len + 1); - tv_list_append_number(rettv->vval.v_list, c); - for (int i = 0; i < composing_len; i++) { - tv_list_append_number(rettv->vval.v_list, pcc[i]); - } + + // schar values are already processed chars 
which are always NUL-terminated. + // A single [0] is expected when char is NUL. + size_t i = 0; + do { + int c = utf_ptr2char(buf + i); + tv_list_append_number(rettv->vval.v_list, c); + i += (size_t)utf_ptr2len(buf + i); + } while (buf[i] != NUL); } /// "screencol()" function @@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr return; } - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); rettv->vval.v_string = xstrdup(buf); } @@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt char *const csearch = tv_dict_get_string(d, "char", false); if (csearch != NULL) { - int pcc[MAX_MCO]; - const int c = utfc_ptr2char(csearch, pcc); + int c = utf_ptr2char(csearch); set_last_csearch(c, csearch, utfc_ptr2len(csearch)); } diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c index 692b320335..d92be6404b 100644 --- a/src/nvim/ex_cmds.c +++ b/src/nvim/ex_cmds.c @@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[] /// ":ascii" and "ga" implementation void do_ascii(exarg_T *eap) { - char *dig; - int cc[MAX_MCO]; - int c = utfc_ptr2char(get_cursor_pos_ptr(), cc); - if (c == NUL) { + char *data = get_cursor_pos_ptr(); + size_t len = (size_t)utfc_ptr2len(data); + + if (len == 0) { msg("NUL", 0); return; } - size_t iobuff_len = 0; + bool need_clear = true; + msg_sb_eol(); + msg_start(); + + int c = utf_ptr2char(data); + size_t off = 0; - int ci = 0; + // TODO(bfredl): merge this with the main loop if (c < 0x80) { if (c == NL) { // NUL is stored as NL. 
c = NUL; @@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap) char buf2[20]; buf2[0] = NUL; - dig = get_digraph_for_char(cval); + char *dig = get_digraph_for_char(cval); if (dig != NULL) { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"), - transchar(c), buf1, buf2, cval, cval, cval, dig); + vim_snprintf(IObuff, sizeof(IObuff), + _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"), + transchar(c), buf1, buf2, cval, cval, cval, dig); } else { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - _("<%s>%s%s %d, Hex %02x, Octal %03o"), - transchar(c), buf1, buf2, cval, cval, cval); - } - - c = cc[ci++]; - } - -#define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1) - // Space for description: - // - 1 byte for separator (starting from second entry) - // - 1 byte for "<" - // - 1 byte for space to draw composing character on (optional, but really - // mostly required) - // - up to MB_MAXBYTES bytes for character itself - // - 16 bytes for raw text ("> , Hex , Octal "). - // - at least 4 bytes for hexadecimal representation - // - at least 3 bytes for decimal representation - // - at least 3 bytes for octal representation - // - 1 byte for NUL - // - // Taking into account MAX_MCO and characters which need 8 bytes for - // hexadecimal representation, but not taking translation into account: - // resulting string will occupy less then 400 bytes (conservative estimate). - // - // Less then 1000 bytes if translation multiplies number of bytes needed for - // raw text by 6, so it should always fit into 1025 bytes reserved for IObuff. + vim_snprintf(IObuff, sizeof(IObuff), + _("<%s>%s%s %d, Hex %02x, Octal %03o"), + transchar(c), buf1, buf2, cval, cval, cval); + } + + msg_multiline(IObuff, 0, true, &need_clear); + + off += (size_t)utf_ptr2len(data); // needed for overlong ascii? + } // Repeat for combining characters, also handle multiby here. 
- while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) { + while (off < len) { + c = utf_ptr2char(data + off); + + size_t iobuff_len = 0; // This assumes every multi-byte char is printable... - if (iobuff_len > 0) { + if (off > 0) { IObuff[iobuff_len++] = ' '; } IObuff[iobuff_len++] = '<'; @@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap) } iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); - dig = get_digraph_for_char(c); + char *dig = get_digraph_for_char(c); if (dig != NULL) { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - (c < 0x10000 - ? _("> %d, Hex %04x, Oct %o, Digr %s") - : _("> %d, Hex %08x, Oct %o, Digr %s")), - c, c, c, dig); + vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len, + (c < 0x10000 + ? _("> %d, Hex %04x, Oct %o, Digr %s") + : _("> %d, Hex %08x, Oct %o, Digr %s")), + c, c, c, dig); } else { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - (c < 0x10000 - ? _("> %d, Hex %04x, Octal %o") - : _("> %d, Hex %08x, Octal %o")), - c, c, c); - } - if (ci == MAX_MCO) { - break; + vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len, + (c < 0x10000 + ? _("> %d, Hex %04x, Octal %o") + : _("> %d, Hex %08x, Octal %o")), + c, c, c); } - c = cc[ci++]; - } - if (ci != MAX_MCO && c != 0) { - xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len); + + msg_multiline(IObuff, 0, true, &need_clear); + + off += (size_t)utf_ptr2len(data + off); // needed for overlong ascii? } - msg(IObuff, 0); + if (need_clear) { + msg_clr_eos(); + } + msg_end(); } /// ":left", ":center" and ":right": align text. diff --git a/src/nvim/grid.c b/src/nvim/grid.c index f21b7e3a90..6320abe4ea 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off) } } -/// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell. 
-schar_T schar_from_cc(int c, int u8cc[MAX_MCO]) -{ - char buf[MAX_SCHAR_SIZE]; - int len = utf_char2bytes(c, buf); - for (int i = 0; i < MAX_MCO; i++) { - if (u8cc[i] == 0) { - break; - } - len += utf_char2bytes(u8cc[i], buf + len); - } - buf[len] = 0; - return schar_from_buf(buf, (size_t)len); -} - schar_T schar_from_str(char *str) { if (str == NULL) { @@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols) schar_get(scbuf, buf[i]); char scbuf_new[MAX_SCHAR_SIZE]; - int len = utf_char2bytes(c0new, scbuf_new); + size_t len = (size_t)utf_char2bytes(c0new, scbuf_new); if (c1new) { - len += utf_char2bytes(c1new, scbuf_new + len); + len += (size_t)utf_char2bytes(c1new, scbuf_new + len); } int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0); size_t rest = strlen(scbuf + off); - if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) { - // TODO(bfredl): this cannot happen just yet, as we only construct - // schar_T values with up to MAX_MCO+1 composing codepoints. When code - // is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit, - // we need be able to peel off a composing char which doesn't fit anymore. - abort(); + if (rest + len + 1 > MAX_SCHAR_SIZE) { + // Too bigly, discard one code-point. + // This should be enough as c0 cannot grow more than from 2 to 4 bytes + // (base arabic to extended arabic) + rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1; } memcpy(scbuf_new + len, scbuf + off, rest); - buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest); + buf[i] = schar_from_buf(scbuf_new, len + rest); next: c0prev = c0; @@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row) return grid->attrs[grid->line_offset[row]] < 0; } -/// Get a single character directly from grid.chars into "bytes", which must -/// have a size of "MB_MAXBYTES + 1". -/// If "attrp" is not NULL, return the character's attribute in "*attrp". 
+/// Get a single character directly from grid.chars +/// +/// @param[out] attrp set to the character's attribute (optional) schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp) { grid_adjust(&grid, &row, &col); @@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) { const char *ptr = text; int len = textlen; - int u8cc[MAX_MCO]; assert(grid_line_grid); int start_col = col; int max_col = grid_line_maxcol; - while (col < max_col - && (len < 0 || (int)(ptr - text) < len) - && *ptr != NUL) { + while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { // check if this is the first byte of a multibyte int mbyte_blen = len > 0 ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) : utfc_ptr2len(ptr); - int u8c = len >= 0 - ? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr)) - : utfc_ptr2char(ptr, u8cc); - int mbyte_cells = utf_char2cells(u8c); + int firstc; + schar_T schar = len >= 0 + ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc) + : utfc_ptr2schar(ptr, &firstc); + int mbyte_cells = utf_char2cells(firstc); if (mbyte_cells > 2) { mbyte_cells = 1; - u8c = 0xFFFD; - u8cc[0] = 0; + + schar = schar_from_char(0xFFFD); } if (col + mbyte_cells > max_col) { // Only 1 cell left, but character requires 2 cells: // display a '>' in the last column to avoid wrapping. */ - u8c = '>'; - u8cc[0] = 0; + schar = schar_from_ascii('>'); mbyte_cells = 1; } - schar_T buf; - // TODO(bfredl): why not just keep the original byte sequence. - buf = schar_from_cc(u8c, u8cc); - // When at the start of the text and overwriting the right half of a // two-cell character in the same grid, truncate that into a '>'. 
if (ptr == text && col > grid_line_first && col < grid_line_last @@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) linebuf_char[col - 1] = schar_from_ascii('>'); } - linebuf_char[col] = buf; + linebuf_char[col] = schar; linebuf_attr[col] = attr; linebuf_vcol[col] = -1; if (mbyte_cells == 2) { diff --git a/src/nvim/grid_defs.h b/src/nvim/grid_defs.h index 11e736fc0c..3cc2d788d3 100644 --- a/src/nvim/grid_defs.h +++ b/src/nvim/grid_defs.h @@ -7,8 +7,8 @@ #include "nvim/pos.h" #include "nvim/types.h" -#define MAX_MCO 6 // fixed value for 'maxcombine' -// Includes final NUL. at least 4*(MAX_MCO+1)+1 +// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29 +// ensures we can fit all composed chars which did fit before. #define MAX_SCHAR_SIZE 32 // if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness) @@ -35,7 +35,7 @@ enum { /// we can avoid sending bigger updates than necessary to the Ul layer. /// /// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can -/// contain up to MAX_MCO composing characters after the base character. +/// contain composing characters as many as fits in MAX_SCHAR_SIZE-1 bytes /// The composing characters are to be drawn on top of the original character. /// The content after the NUL is not defined (so comparison must be done a /// single cell at a time). 
Double-width characters are stored in the left cell, diff --git a/src/nvim/insexpand.c b/src/nvim/insexpand.c index f565d5b9e8..adbd2a5315 100644 --- a/src/nvim/insexpand.c +++ b/src/nvim/insexpand.c @@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c) return; } if ((cc = utf_char2len(c)) > 1) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; utf_char2bytes(c, buf); buf[cc] = NUL; diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c index 5072d14c0e..a200b0a32f 100644 --- a/src/nvim/lua/stdlib.c +++ b/src/nvim/lua/stdlib.c @@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL if (offset < 0 || offset > (intptr_t)s1_len) { return luaL_error(lstate, "index out of range"); } - int head_offset = utf_cp_head_off(s1, s1 + offset - 1); + int head_offset = -utf_cp_head_off(s1, s1 + offset - 1); lua_pushinteger(lstate, head_offset); return 1; } diff --git a/src/nvim/match.c b/src/nvim/match.c index 3420455e5f..0cd0426cff 100644 --- a/src/nvim/match.c +++ b/src/nvim/match.c @@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id); if (cur->mit_conceal_char) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL; tv_dict_add_str(dict, S_LEN("conceal"), buf); diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0d468889a4..3a13aeddb8 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -48,6 +48,7 @@ #include "nvim/getchar.h" #include "nvim/gettext.h" #include "nvim/globals.h" +#include "nvim/grid.h" #include "nvim/grid_defs.h" #include "nvim/iconv.h" #include "nvim/keycodes.h" @@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2) return arabic_combine(utf_ptr2char(p1), c2); } -/// Convert a UTF-8 string to a wide character +/// Get the screen char at the beginning of a string /// -/// Also gets up to #MAX_MCO composing characters. 
+/// Caller is expected to check for things like unprintable chars etc +/// If first char in string is a composing char, prepend a space to display it correctly. /// -/// @param[out] pcc Location where to store composing characters. Must have -/// space at least for #MAX_MCO + 1 elements. +/// If "p" starts with an invalid sequence, zero is returned. /// -/// @return leading character. -int utfc_ptr2char(const char *p, int *pcc) +/// @param[out] firstc (required) The first codepoint of the screen char, +/// or the first byte of an invalid sequence +/// +/// @return the char +schar_T utfc_ptr2schar(const char *p, int *firstc) + FUNC_ATTR_NONNULL_ALL { - int i = 0; - int c = utf_ptr2char(p); - int len = utf_ptr2len(p); - - // Only accept a composing char when the first char isn't illegal. - if ((len > 1 || (uint8_t)(*p) < 0x80) - && (uint8_t)p[len] >= 0x80 - && utf_composinglike(p, p + len)) { - int cc = utf_ptr2char(p + len); - while (true) { - pcc[i++] = cc; - if (i == MAX_MCO) { - break; - } - len += utf_ptr2len(p + len); - if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) { - break; - } - } - } + *firstc = c; // NOT optional, you are gonna need it + bool first_compose = utf_iscomposing(c); + size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); - if (i < MAX_MCO) { // last composing char must be 0 - pcc[i] = 0; + if (len == 1 && (uint8_t)(*p) >= 0x80) { + return 0; // invalid sequence } - return c; + return schar_from_buf_first(p, len, first_compose); } -// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO -// composing characters. Use no more than p[maxlen]. -// -// @param [out] pcc: composing chars, last one is 0 -int utfc_ptr2char_len(const char *p, int *pcc, int maxlen) +/// Get the screen char at the beginning of a string with length +/// +/// Like utfc_ptr2schar but use no more than p[maxlen]. 
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) + FUNC_ATTR_NONNULL_ALL { assert(maxlen > 0); - int i = 0; + size_t len = (size_t)utf_ptr2len_len(p, maxlen); + if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + // invalid or truncated sequence + *firstc = (uint8_t)(*p); + return 0; + } - int len = utf_ptr2len_len(p, maxlen); - // Is it safe to use utf_ptr2char()? - bool safe = len > 1 && len <= maxlen; - int c = safe ? utf_ptr2char(p) : (uint8_t)(*p); + int c = utf_ptr2char(p); + *firstc = c; + bool first_compose = utf_iscomposing(c); + maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); + len = (size_t)utfc_ptr2len_len(p, maxlen); - // Only accept a composing char when the first char isn't illegal. - if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) { - for (; i < MAX_MCO; i++) { - int len_cc = utf_ptr2len_len(p + len, maxlen - len); - safe = len_cc > 1 && len_cc <= maxlen - len; - if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 - || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) { - break; - } - len += len_cc; - } - } + return schar_from_buf_first(p, len, first_compose); +} - if (i < MAX_MCO) { - // last composing char must be 0 - pcc[i] = 0; +/// Caller must ensure there is space for `first_compose` +static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose) +{ + if (first_compose) { + char cbuf[MAX_SCHAR_SIZE]; + cbuf[0] = ' '; + memcpy(cbuf + 1, buf, len); + return schar_from_buf(cbuf, len + 1); + } else { + return schar_from_buf(buf, len); } - - return c; -#undef ISCOMPOSING } /// Get the length of a UTF-8 byte sequence representing a single codepoint @@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p) return 1; } - // Check for composing characters. We can handle only the first six, but - // skip all of them (otherwise the cursor would get stuck). + // Check for composing characters. 
int prevlen = 0; while (true) { if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { @@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in) /// Return the offset from "p" to the first byte of the codepoint it points /// to. Can start anywhere in a stream of bytes. /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately and returns a negative offset. +/// separately. /// /// @param[in] base Pointer to start of string /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint // -/// @return 0 if invalid sequence, else offset to previous codepoint +/// @return 0 if invalid sequence, else number of bytes to previous codepoint int utf_cp_head_off(const char *base, const char *p) { int i; @@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p) } // Find the first character that is not 10xx.xxxx - for (i = 0; p - i > base; i--) { - if (((uint8_t)p[i] & 0xc0) != 0x80) { + for (i = 0; p - i >= base; i++) { + if (((uint8_t)p[-i] & 0xc0) != 0x80) { break; } } - // Find the last character that is 10xx.xxxx - for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {} + // Find the last character that is 10xx.xxxx (condition terminates on NUL) + int j = 1; + while (((uint8_t)p[j] & 0xc0) == 0x80) { + j++; + } // Check for illegal sequence. 
- if (utf8len_tab[(uint8_t)p[i]] == 1) { + if (utf8len_tab[(uint8_t)p[-i]] != j + i) { return 0; } return i; diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 1d1a9439ad..c177f14ce2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -7,6 +7,7 @@ #include "nvim/cmdexpand_defs.h" #include "nvim/eval/typval_defs.h" #include "nvim/func_attr.h" +#include "nvim/grid_defs.h" #include "nvim/mbyte_defs.h" #include "nvim/os/os_defs.h" #include "nvim/types.h" diff --git a/src/nvim/message.c b/src/nvim/message.c index ee1a9e60b0..9e9aa1fcd6 100644 --- a/src/nvim/message.c +++ b/src/nvim/message.c @@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0; static void ui_ext_msg_set_pos(int row, bool scrolled) { - char buf[MAX_MCO + 1]; + char buf[MB_MAXCHAR + 1]; size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf); buf[size] = '\0'; ui_call_msg_set_pos(msg_grid.handle, row, scrolled, @@ -1471,7 +1471,7 @@ void msg_putchar(int c) void msg_putchar_attr(int c, int attr) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; if (IS_SPECIAL(c)) { buf[0] = (char)K_SPECIAL; @@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr) mode_displayed = false; } - // If the string starts with a composing character first draw a space on - // which the composing char can be drawn. - if (utf_iscomposing(utf_ptr2char(msgstr))) { - msg_puts_attr(" ", attr); - } - // Go over the string. Special characters are translated and printed. // Normal characters are printed several at a time. 
while (--len >= 0 && !got_int) { diff --git a/src/nvim/option_vars.h b/src/nvim/option_vars.h index f0c752a2b1..0193e43de7 100644 --- a/src/nvim/option_vars.h +++ b/src/nvim/option_vars.h @@ -556,6 +556,7 @@ EXTERN char *p_mp; ///< 'makeprg' EXTERN char *p_mps; ///< 'matchpairs' EXTERN OptInt p_mat; ///< 'matchtime' EXTERN OptInt p_mco; ///< 'maxcombine' +#define MAX_MCO 6 // fixed value for 'maxcombine' EXTERN OptInt p_mfd; ///< 'maxfuncdepth' EXTERN OptInt p_mmd; ///< 'maxmapdepth' EXTERN OptInt p_mmp; ///< 'maxmempattern' diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c index 15a58a4434..dab278e383 100644 --- a/src/nvim/spellsuggest.c +++ b/src/nvim/spellsuggest.c @@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word) static bool similar_chars(slang_T *slang, int c1, int c2) { int m1, m2; - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; hashitem_T *hi; if (c1 >= 256) { diff --git a/test/functional/ui/fold_spec.lua b/test/functional/ui/fold_spec.lua index 9a0182ea29..1addf7088e 100644 --- a/test/functional/ui/fold_spec.lua +++ b/test/functional/ui/fold_spec.lua @@ -1102,8 +1102,6 @@ describe("folded lines", function() end) it("works with multibyte text", function() - -- Currently the only allowed value of 'maxcombine' - eq(6, meths.get_option_value('maxcombine', {})) eq(true, meths.get_option_value('arabicshape', {})) insert([[ å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة @@ -1120,7 +1118,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | + å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | möre tex^t | {1:~ }| {1:~ }| @@ -1132,7 +1130,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | + å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | möre tex^t | {1:~ }| {1:~ }| @@ -1156,7 +1154,7 @@ describe("folded lines", function() [2:---------------------------------------------]| 
[3:---------------------------------------------]| ## grid 2 - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1168,7 +1166,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1192,7 +1190,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1204,7 +1202,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1228,7 +1226,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}| + {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}| {1:~ }| {1:~ }| {1:~ }| @@ -1240,7 +1238,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}| + {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}| {1:~ }| {1:~ }| {1:~ }| @@ -1265,7 +1263,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}| + {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}| {1: ~}| {1: ~}| {1: ~}| @@ -1277,7 +1275,7 @@ 
describe("folded lines", function() ]]) else screen:expect([[ - {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}| + {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}| {1: ~}| {1: ~}| {1: ~}| @@ -1301,7 +1299,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1313,7 +1311,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1337,7 +1335,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1349,7 +1347,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1373,7 +1371,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å| + ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1385,7 +1383,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å| + ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1409,7 +1407,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - 
ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å| + ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1421,7 +1419,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å| + ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| diff --git a/test/functional/ui/multibyte_spec.lua b/test/functional/ui/multibyte_spec.lua index 077dd1a779..d72bf27d6b 100644 --- a/test/functional/ui/multibyte_spec.lua +++ b/test/functional/ui/multibyte_spec.lua @@ -228,6 +228,36 @@ describe("multibyte rendering", function() ]]} end) + + it('works with arabicshape and multiple composing chars', function() + -- this tests an important edge case: arabicshape might increase the byte size of the base + -- character in a way so that the last composing char no longer fits. use "g8" on the text + -- to observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape') + -- If we would increase the schar_t size, say from 32 to 64 bytes, we need to extend the + -- test text with even more zalgo energy to still touch this edge case. 
+ + meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"}) + command('set noarabicshape') + + screen:expect{grid=[[ + ^سلام့̀́̂̃̄̅̆̇̈̉̊̋̌ | + {1:~ }| + {1:~ }| + {1:~ }| + {1:~ }| + | + ]]} + + command('set arabicshape') + screen:expect{grid=[[ + ^ﺱﻼﻣ̀́̂̃̄̅̆̇̈̉̊̋̌ | + {1:~ }| + {1:~ }| + {1:~ }| + {1:~ }| + | + ]]} + end) end) describe('multibyte rendering: statusline', function() diff --git a/test/functional/ui/output_spec.lua b/test/functional/ui/output_spec.lua index 0dd1f0325c..7b93b74eac 100644 --- a/test/functional/ui/output_spec.lua +++ b/test/functional/ui/output_spec.lua @@ -225,8 +225,8 @@ describe("shell command :!", function() å | ref: å̲ | 1: å̲ | - 2: å ̲ | - 3: å ̲ | + 2: å ̲ | + 3: å ̲ | | {3:Press ENTER or type command to continue}^ | ]]) diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index fdb1bceab0..cd94624570 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it) local ffi = helpers.ffi local eq = helpers.eq -local mbyte = helpers.cimport("./src/nvim/mbyte.h") -local charset = helpers.cimport('./src/nvim/charset.h') +local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') describe('mbyte', function() - -- Array for composing characters - local intp = ffi.typeof('int[?]') - local function to_intp() - -- how to get MAX_MCO from globals.h? - return intp(7, 1) - end - -- Convert from bytes to string local function to_string(bytes) local s = {} @@ -30,14 +22,14 @@ describe('mbyte', function() itp('utf_ptr2char', function() -- For strings with length 1 the first byte is returned. for c = 0, 255 do - eq(c, mbyte.utf_ptr2char(to_string({c, 0}))) + eq(c, lib.utf_ptr2char(to_string({c, 0}))) end -- Some ill formed byte sequences that should not be recognized as UTF-8 -- First byte: 0xc0 or 0xc1 -- Second byte: 0x80 .. 
0xbf - --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80}))) - --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf}))) + --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80}))) + --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf}))) -- -- Sequences with more than four bytes end) @@ -47,240 +39,133 @@ describe('mbyte', function() local char_p = ffi.typeof('char[?]') for c = n * 0x1000, n * 0x1000 + 0xFFF do local p = char_p(4, 0) - mbyte.utf_char2bytes(c, p) - eq(c, mbyte.utf_ptr2char(p)) - eq(charset.vim_iswordc(c), charset.vim_iswordp(p)) + lib.utf_char2bytes(c, p) + eq(c, lib.utf_ptr2char(p)) + eq(lib.vim_iswordc(c), lib.vim_iswordp(p)) end end) end - describe('utfc_ptr2char_len', function() + describe('utfc_ptr2schar_len', function() + local function test_seq(seq) + local firstc = ffi.new("int[1]") + local buf = ffi.new("char[32]") + lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) + return {ffi.string(buf), firstc[0]} + end + + local function byte(val) + return {string.char(val), val} + end itp('1-byte sequences', function() - local pcc = to_intp() - for c = 0, 255 do - eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1)) - eq(0, pcc[0]) + eq({'', 0}, test_seq{0}) + for c = 1, 127 do + eq(byte(c), test_seq{c}) + end + for c = 128, 255 do + eq({'', c}, test_seq{c}) end end) itp('2-byte sequences', function() - local pcc = to_intp() -- No combining characters - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f}) -- No combining characters - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2)) - eq(0, 
pcc[0]) + eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0xc0}) end) itp('3-byte sequences', function() - local pcc = to_intp() - -- No second UTF-8 character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80}) -- No combining character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80}) -- Combining character is U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc}) -- Incomplete combining character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc}) - -- One UTF-8 character - pcc = to_intp() - eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3)) - eq(0, pcc[0]) + -- One UTF-8 character (composing only) + eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90}) end) itp('4-byte sequences', function() - local pcc = to_intp() -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, 
mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80}) end) itp('5+-byte sequences', function() - local pcc = to_intp() - -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00}) -- Combining characters U+0300 and U+0301 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, 
test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81}) -- Combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}) -- Combining characters U+0300, U+0301, U+0302, U+0303 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0000, pcc[4]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0000, pcc[5]) - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(1, pcc[6]) - - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305, U+0306, but only save six (= MAX_MCO). 
- pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(0x0001, pcc[6]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}) - -- Only three following combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}) + -- Only three following combining characters U+0300, U+0301, U+0302 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 
0x7f}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc}) -- Combining characters U+1AB0 and U+0301 - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string( - {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9)) - eq(0x1ab0, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}) end) end) |