diff options
-rw-r--r-- | runtime/doc/mbyte.txt | 3 | ||||
-rw-r--r-- | runtime/doc/news.txt | 7 | ||||
-rw-r--r-- | runtime/doc/vim_diff.txt | 13 | ||||
-rw-r--r-- | runtime/lua/vim/_meta/options.lua | 2 | ||||
-rw-r--r-- | src/nvim/change.c | 11 | ||||
-rw-r--r-- | src/nvim/charset.c | 19 | ||||
-rw-r--r-- | src/nvim/digraph.c | 2 | ||||
-rw-r--r-- | src/nvim/drawline.c | 261 | ||||
-rw-r--r-- | src/nvim/edit.c | 17 | ||||
-rw-r--r-- | src/nvim/eval.c | 2 | ||||
-rw-r--r-- | src/nvim/eval/funcs.c | 33 | ||||
-rw-r--r-- | src/nvim/ex_cmds.c | 110 | ||||
-rw-r--r-- | src/nvim/grid.c | 65 | ||||
-rw-r--r-- | src/nvim/grid_defs.h | 6 | ||||
-rw-r--r-- | src/nvim/insexpand.c | 2 | ||||
-rw-r--r-- | src/nvim/lua/stdlib.c | 2 | ||||
-rw-r--r-- | src/nvim/match.c | 2 | ||||
-rw-r--r-- | src/nvim/mbyte.c | 123 | ||||
-rw-r--r-- | src/nvim/mbyte.h | 1 | ||||
-rw-r--r-- | src/nvim/message.c | 10 | ||||
-rw-r--r-- | src/nvim/option_vars.h | 1 | ||||
-rw-r--r-- | src/nvim/spellsuggest.c | 2 | ||||
-rw-r--r-- | test/functional/ui/fold_spec.lua | 38 | ||||
-rw-r--r-- | test/functional/ui/multibyte_spec.lua | 30 | ||||
-rw-r--r-- | test/functional/ui/output_spec.lua | 4 | ||||
-rw-r--r-- | test/unit/mbyte_spec.lua | 243 |
26 files changed, 403 insertions, 606 deletions
diff --git a/runtime/doc/mbyte.txt b/runtime/doc/mbyte.txt index aedef87a09..0a7e0baad3 100644 --- a/runtime/doc/mbyte.txt +++ b/runtime/doc/mbyte.txt @@ -646,7 +646,8 @@ widespread as file format. A composing or combining character is used to change the meaning of the character before it. The combining characters are drawn on top of the preceding character. -Up to six combining characters can be displayed. +Too big combined characters cannot be displayed, but they can still be +inspected using the |g8| and |ga| commands described below. When editing text a composing character is mostly considered part of the preceding character. For example "x" will delete a character and its following composing characters by default. diff --git a/runtime/doc/news.txt b/runtime/doc/news.txt index 2f48ebfeff..cb3220a630 100644 --- a/runtime/doc/news.txt +++ b/runtime/doc/news.txt @@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior. Note that syntax highlighting of code examples requires a matching parser and may be affected by custom queries. +• Support for rendering multibyte characters using composing characters has been + enhanced. The maximum limit have been increased from 1+6 codepoints to + 31 bytes, which is guaranteed to fit all chars from before but often more. + + NOTE: the regexp engine still has a hard-coded limit of considering + 6 composing chars only. + ============================================================================== REMOVED FEATURES *news-removed* diff --git a/runtime/doc/vim_diff.txt b/runtime/doc/vim_diff.txt index 05d7e5feb9..5e09cc2481 100644 --- a/runtime/doc/vim_diff.txt +++ b/runtime/doc/vim_diff.txt @@ -722,9 +722,16 @@ Options: < *'macatsui'* *'maxcombine'* *'mco'* - Nvim always displays up to 6 combining characters. You can still edit - text with more than 6 combining characters, you just can't see them. - Use |g8| or |ga|. See |mbyte-combining|. + Nvim counts maximum character sizes in bytes, not codepoints. This is + guaranteed to be big enough to always fit all chars properly displayed + in vim with 'maxcombine' set to 6. + + You can still edit text with larger characters than fits in the screen buffer, + you just can't see them. Use |g8| or |ga|. See |mbyte-combining|. + + NOTE: the rexexp engine still has a hard-coded limit of considering + 6 composing chars only. + *'maxmem'* Nvim delegates memory-management to the OS. *'maxmemtot'* Nvim delegates memory-management to the OS. printoptions diff --git a/runtime/lua/vim/_meta/options.lua b/runtime/lua/vim/_meta/options.lua index 19ae786177..be4a4dd49c 100644 --- a/runtime/lua/vim/_meta/options.lua +++ b/runtime/lua/vim/_meta/options.lua @@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg --- security reasons. --- --- @type boolean -vim.o.fsync = false +vim.o.fsync = true vim.o.fs = vim.o.fsync vim.go.fsync = vim.o.fsync vim.go.fs = vim.go.fsync diff --git a/src/nvim/change.c b/src/nvim/change.c index 58718811bc..aa58779f5b 100644 --- a/src/nvim/change.c +++ b/src/nvim/change.c @@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len) /// convert bytes to a character. void ins_char(int c) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; size_t n = (size_t)utf_char2bytes(c, buf); // When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte. @@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine) // If 'delcombine' is set and deleting (less than) one character, only // delete the last combining character. - if (p_deco && use_delcombine - && utfc_ptr2len(oldp + col) >= count) { - int cc[MAX_MCO]; - - (void)utfc_ptr2char(oldp + col, cc); - if (cc[0] != NUL) { + if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) { + char *p0 = oldp + col; + if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) { // Find the last composing char, there can be several. int n = col; do { diff --git a/src/nvim/charset.c b/src/nvim/charset.c index 0adcc09ec7..5dfc9c444d 100644 --- a/src/nvim/charset.c +++ b/src/nvim/charset.c @@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab) while (*p) { const size_t l = (size_t)utfc_ptr2len(p); if (l > 1) { - int pcc[MAX_MCO + 1]; - pcc[0] = utfc_ptr2char(p, &pcc[1]); - - if (vim_isprintc(pcc[0])) { + if (vim_isprintc(utf_ptr2char(p))) { len += l; } else { - for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { + for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) { + int c = utf_ptr2char(p + off); char hexbuf[9]; - len += transchar_hex(hexbuf, pcc[i]); + len += transchar_hex(hexbuf, c); } } p += l; @@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co if (buf_p + l > buf_e) { break; // Exceeded `buf` size. } - int pcc[MAX_MCO + 1]; - pcc[0] = utfc_ptr2char(p, &pcc[1]); - if (vim_isprintc(pcc[0])) { + if (vim_isprintc(utf_ptr2char(p))) { memmove(buf_p, p, l); buf_p += l; } else { - for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) { + for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) { + int c = utf_ptr2char(p + off); char hexbuf[9]; // <up to 6 bytes>NUL - const size_t hexlen = transchar_hex(hexbuf, pcc[i]); + const size_t hexlen = transchar_hex(hexbuf, c); if (buf_p + hexlen > buf_e) { break; } diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c index bc0ce99c5e..1bff78f90a 100644 --- a/src/nvim/digraph.c +++ b/src/nvim/digraph.c @@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n) bool check_digraph_chars_valid(int char1, int char2) { if (char2 == 0) { - char msg[MB_MAXBYTES + 1]; + char msg[MB_MAXCHAR + 1]; msg[utf_char2bytes(char1, msg)] = NUL; semsg(_(e_digraph_must_be_just_two_characters_str), msg); return false; diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c index 11b4e55c5c..0cfab5cec9 100644 --- a/src/nvim/drawline.c +++ b/src/nvim/drawline.c @@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells const char *p = *pp; int cells = utf_ptr2cells(p); int c_len = utfc_ptr2len(p); - int u8c, u8cc[MAX_MCO]; assert(maxcells > 0); if (cells > maxcells) { dest[0] = schar_from_ascii(' '); return 1; } - u8c = utfc_ptr2char(p, u8cc); if (*p == TAB) { cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells); } @@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells for (int c = 0; c < cells; c++) { dest[c] = schar_from_ascii(' '); } - goto done; - } else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) { - dest[0] = schar_from_ascii(*p); } else { - dest[0] = schar_from_cc(u8c, u8cc); - } - if (cells > 1) { - dest[1] = 0; + int u8c; + dest[0] = utfc_ptr2schar(p, &u8c); + if (cells > 1) { + dest[1] = 0; + } } -done: + *pp += c_len; return cells; } @@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t } } -static bool check_mb_utf8(int *c, int *u8cc) -{ - if (utf_char2len(*c) > 1) { - *u8cc = 0; - *c = 0xc0; - return true; - } - return false; -} - static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line) { colnr_T trailcol = MAXCOL; @@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl { winlinevars_T wlv; // variables passed between functions - int c = 0; // init for GCC colnr_T vcol_prev = -1; // "wlv.vcol" of previous character char *line; // current line char *ptr; // current position in "line" @@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl int multi_attr = 0; // attributes desired by multibyte int mb_l = 1; // multi-byte byte length int mb_c = 0; // decoded multi-byte character - bool mb_utf8 = false; // screen char is UTF-8 char - int u8cc[MAX_MCO]; // composing UTF-8 chars + schar_T mb_schar; // complete screen char int change_start = MAXCOL; // first col of changed area int change_end = -1; // last col of changed area bool in_multispace = false; // in multiple consecutive spaces @@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // For the '$' of the 'list' option, n_extra == 1, p_extra == "". if (wlv.n_extra > 0) { if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) { - c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra; - mb_c = c; // doesn't handle non-utf-8 multi-byte! - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra; + mb_schar = schar_from_char(mb_c); + wlv.n_extra--; } else { assert(wlv.p_extra != NULL); - c = (uint8_t)(*wlv.p_extra); - mb_c = c; - // If the UTF-8 character is more than one byte: - // Decode it into "mb_c". mb_l = utfc_ptr2len(wlv.p_extra); - mb_utf8 = false; - if (mb_l > wlv.n_extra) { - mb_l = 1; - } else if (mb_l > 1) { - mb_c = utfc_ptr2char(wlv.p_extra, u8cc); - mb_utf8 = true; - c = 0xc0; - } - if (mb_l == 0) { // at the NUL at end-of-line + mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c); + // mb_l=0 at the end-of-line NUL + if (mb_l > wlv.n_extra || mb_l == 0) { mb_l = 1; } // If a double-width char doesn't fit display a '>' in the last column. + // Don't advance the pointer but put the character at the start of the next line. if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { - c = '>'; - mb_c = c; + mb_c = '>'; mb_l = 1; (void)mb_l; + mb_schar = schar_from_ascii(mb_c); multi_attr = win_hl_attr(wp, HLF_AT); if (wlv.cul_attr) { @@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl ? hl_combine_attr(wlv.cul_attr, multi_attr) : hl_combine_attr(multi_attr, wlv.cul_attr); } - - // put the pointer back to output the double-width - // character at the start of the next line. - wlv.n_extra++; - wlv.p_extra--; } else { - wlv.n_extra -= mb_l - 1; - wlv.p_extra += mb_l - 1; + wlv.n_extra -= mb_l; + wlv.p_extra += mb_l; } - wlv.p_extra++; } - wlv.n_extra--; // Only restore search_attr and area_attr after "n_extra" in // the next screen line is also done. @@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } } else if (has_fold) { // skip writing the buffer line itself - c = NUL; + mb_c = NUL; } else { - int c0; char *prev_ptr = ptr; - // Get a character from the line itself. - c0 = c = (uint8_t)(*ptr); - mb_c = c; - - if (c == NUL) { + // first byte of next char + int c0 = (uint8_t)(*ptr); + if (c0 == NUL) { // no more cells to skip wlv.skip_cells = 0; } - // If the UTF-8 character is more than one byte: Decode it - // into "mb_c". + // Get a character from the line itself. mb_l = utfc_ptr2len(ptr); - mb_utf8 = false; - if (mb_l > 1) { - mb_c = utfc_ptr2char(ptr, u8cc); - // Overlong encoded ASCII or ASCII with composing char - // is displayed normally, except a NUL. - if (mb_c < 0x80) { - c0 = c = mb_c; - } - mb_utf8 = true; - - // At start of the line we can have a composing char. - // Draw it as a space with a composing char. - if (utf_iscomposing(mb_c)) { - for (int i = MAX_MCO - 1; i > 0; i--) { - u8cc[i] = u8cc[i - 1]; - } - u8cc[0] = mb_c; - mb_c = ' '; - } + mb_schar = utfc_ptr2schar(ptr, &mb_c); + + // Overlong encoded ASCII or ASCII with composing char + // is displayed normally, except a NUL. + if (mb_l > 1 && mb_c < 0x80) { + c0 = mb_c; } - if ((mb_l == 1 && c >= 0x80) + if ((mb_l == 1 && c0 >= 0x80) || (mb_l >= 1 && mb_c == 0) || (mb_l > 1 && (!vim_isprintc(mb_c)))) { // Illegal UTF-8 byte: display as <xx>. - // Non-BMP character : display as ? or fullwidth ?. + // Non-printable character : display as ? or fullwidth ?. transchar_hex(wlv.extra, mb_c); if (wp->w_p_rl) { // reverse rl_mirror_ascii(wlv.extra, NULL); } wlv.p_extra = wlv.extra; - c = (uint8_t)(*wlv.p_extra); mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra); - mb_utf8 = (c >= 0x80); + mb_schar = schar_from_char(mb_c); wlv.n_extra = (int)strlen(wlv.p_extra); wlv.c_extra = NUL; wlv.c_final = NUL; @@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // last column; the character is displayed at the start of the // next line. if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) { - c = '>'; - mb_c = c; - mb_utf8 = false; + mb_c = '>'; mb_l = 1; + mb_schar = schar_from_ascii(mb_c); multi_attr = win_hl_attr(wp, HLF_AT); // Put pointer back so that the character will be // displayed at the start of the next line. @@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = 1; wlv.c_extra = MB_FILLER_CHAR; wlv.c_final = NUL; - c = ' '; + mb_c = ' '; + mb_l = 1; + mb_schar = schar_from_ascii(mb_c); if (area_attr == 0 && search_attr == 0) { wlv.n_attr = wlv.n_extra + 1; wlv.extra_attr = win_hl_attr(wp, HLF_AT); saved_attr2 = wlv.char_attr; // save current attr } - mb_c = c; - mb_utf8 = false; - mb_l = 1; } ptr++; @@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // no concealing past the end of the line, it interferes // with line highlighting. - if (c == NUL) { - syntax_flags = 0; - } else { - syntax_flags = get_syntax_info(&syntax_seqnr); - } + syntax_flags = (mb_c == 0) ? 0 : get_syntax_info(&syntax_seqnr); } if (has_decor && v > 0) { @@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl spell_attr = 0; // do not calculate cap_col at the end of the line or when // only white space is following - if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) { + if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) { char *p; hlf_T spell_hlf = HLF_COUNT; v -= mb_l - 1; @@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // // So only allow to linebreak, once we have found chars not in // 'breakat' in the line. - if (wp->w_p_lbr && !wlv.need_lbr && c != NUL + if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL && !vim_isbreak((uint8_t)(*ptr))) { wlv.need_lbr = true; } // Found last space before word: check for line break. - if (wp->w_p_lbr && c0 == c && wlv.need_lbr - && vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) { + if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr + && vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) { int mb_off = utf_head_off(line, ptr - 1); char *p = ptr - (mb_off + 1); chartabsize_T cts; @@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1; clear_chartabsize_arg(&cts); - if (on_last_col && c != TAB) { + if (on_last_col && mb_c != TAB) { // Do not continue search/match highlighting over the // line break, but for TABs the highlighting should // include the complete width of the character search_attr = 0; } - if (c == TAB && wlv.n_extra + wlv.col > grid->cols) { + if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) { wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts, wp->w_buffer->b_p_vts_array) - 1; } wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' '; wlv.c_final = NUL; - if (ascii_iswhite(c)) { - if (c == TAB) { + if (mb_c < 128 && ascii_iswhite(mb_c)) { + if (mb_c == TAB) { // See "Tab alignment" below. FIX_FOR_BOGUSCOLS; } if (!wp->w_p_list) { - c = ' '; + mb_c = ' '; + mb_schar = schar_from_ascii(mb_c); } } } if (wp->w_p_list) { - in_multispace = c == ' ' && (*ptr == ' ' - || (prev_ptr > line && prev_ptr[-1] == ' ')); + in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' ')); if (!in_multispace) { multispace_pos = 0; } @@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // But not when the character is followed by a composing // character (use mb_l to check that). if (wp->w_p_list - && ((((c == 160 && mb_l == 1) - || (mb_utf8 - && ((mb_c == 160 && mb_l == 2) - || (mb_c == 0x202f && mb_l == 3)))) + && ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3)) && wp->w_p_lcs_chars.nbsp) - || (c == ' ' + || (mb_c == ' ' && mb_l == 1 && (wp->w_p_lcs_chars.space || (in_multispace && wp->w_p_lcs_chars.multispace != NULL)) && ptr - line >= leadcol && ptr - line <= trailcol))) { if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) { - c = wp->w_p_lcs_chars.multispace[multispace_pos++]; + mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++]; if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) { multispace_pos = 0; } } else { - c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp; + mb_c = (mb_c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp; } wlv.n_attr = 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } - if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol) - || (leadcol != 0 && ptr < line + leadcol))) { + if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol) + || (leadcol != 0 && ptr < line + leadcol))) { if (leadcol != 0 && in_multispace && ptr < line + leadcol && wp->w_p_lcs_chars.leadmultispace != NULL) { - c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++]; + mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++]; if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) { multispace_pos = 0; } } else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) { - c = wp->w_p_lcs_chars.trail; + mb_c = wp->w_p_lcs_chars.trail; } else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) { - c = wp->w_p_lcs_chars.lead; + mb_c = wp->w_p_lcs_chars.lead; } else if (leadcol != 0 && wp->w_p_lcs_chars.space) { - c = wp->w_p_lcs_chars.space; + mb_c = wp->w_p_lcs_chars.space; } wlv.n_attr = 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } } // Handling of non-printable characters. - if (!vim_isprintc(c)) { + if (!vim_isprintc(mb_c)) { // when getting a character from the file, we may have to // turn it into something else on the way to putting it on the screen. - if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) { + if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) { int tab_len = 0; colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length char *const sbr = get_showbreak_value(wp); @@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl if (wlv.n_extra > 0) { len += wlv.n_extra - tab_len; } - c = wp->w_p_lcs_chars.tab1; + mb_c = wp->w_p_lcs_chars.tab1; p = get_extra_buf((size_t)len + 1); memset(p, ' ', (size_t)len); p[len] = NUL; @@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } } - mb_utf8 = false; // don't draw as UTF-8 if (wp->w_p_list) { - c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3) - ? wp->w_p_lcs_chars.tab3 - : wp->w_p_lcs_chars.tab1; + mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3) + ? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1; if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) { wlv.c_extra = NUL; // using p_extra from above } else { @@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_attr = tab_len + 1; wlv.extra_attr = win_hl_attr(wp, HLF_0); saved_attr2 = wlv.char_attr; // save current attr - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); } else { wlv.c_final = NUL; wlv.c_extra = ' '; - c = ' '; + mb_c = ' '; } - } else if (c == NUL + mb_schar = schar_from_char(mb_c); + } else if (mb_c == NUL && (wp->w_p_list || ((wlv.fromcol >= 0 || fromcol_prev >= 0) && wlv.tocol > wlv.vcol @@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_extra = 0; } if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) { - c = wp->w_p_lcs_chars.eol; + mb_c = wp->w_p_lcs_chars.eol; } else { - c = ' '; + mb_c = ' '; } lcs_eol_one = -1; ptr--; // put it back at the NUL wlv.extra_attr = win_hl_attr(wp, HLF_AT); wlv.n_attr = 1; - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); - } else if (c != NUL) { - wlv.p_extra = transchar_buf(wp->w_buffer, c); + mb_schar = schar_from_char(mb_c); + } else if (mb_c != NUL) { + wlv.p_extra = transchar_buf(wp->w_buffer, mb_c); if (wlv.n_extra == 0) { - wlv.n_extra = byte2cells(c) - 1; + wlv.n_extra = byte2cells(mb_c) - 1; } if ((dy_flags & DY_UHEX) && wp->w_p_rl) { rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>" @@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl if (wp->w_p_lbr) { char *p; - c = (uint8_t)(*wlv.p_extra); + mb_c = (uint8_t)(*wlv.p_extra); p = get_extra_buf((size_t)wlv.n_extra + 1); memset(p, ' ', (size_t)wlv.n_extra); strncpy(p, // NOLINT(runtime/printf) @@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl p[wlv.n_extra] = NUL; wlv.p_extra = p; } else { - wlv.n_extra = byte2cells(c) - 1; - c = (uint8_t)(*wlv.p_extra++); + wlv.n_extra = byte2cells(mb_c) - 1; + mb_c = (uint8_t)(*wlv.p_extra++); } wlv.n_attr = wlv.n_extra + 1; wlv.extra_attr = win_hl_attr(wp, HLF_8); saved_attr2 = wlv.char_attr; // save current attr - mb_utf8 = false; // don't draw as UTF-8 + mb_schar = schar_from_ascii(mb_c); } else if (VIsual_active && (VIsual_mode == Ctrl_V || VIsual_mode == 'v') && virtual_active() && wlv.tocol != MAXCOL && wlv.vcol < wlv.tocol && wlv.col < grid->cols) { - c = ' '; + mb_c = ' '; + mb_schar = schar_from_char(mb_c); ptr--; // put it back at the NUL } } @@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // First time at this concealed item: display one // character. if (has_match_conc && match_conc) { - c = match_conc; + mb_c = match_conc; } else if (decor_conceal && decor_state.conceal_char) { - c = decor_state.conceal_char; + mb_c = decor_state.conceal_char; if (decor_state.conceal_attr) { wlv.char_attr = decor_state.conceal_attr; } } else if (syn_get_sub_char() != NUL) { - c = syn_get_sub_char(); + mb_c = syn_get_sub_char(); } else if (wp->w_p_lcs_chars.conceal != NUL) { - c = wp->w_p_lcs_chars.conceal; + mb_c = wp->w_p_lcs_chars.conceal; } else { - c = ' '; + mb_c = ' '; } prev_syntax_id = syntax_seqnr; @@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl is_concealing = true; wlv.skip_cells = 1; } - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } else { prev_syntax_id = 0; is_concealing = false; @@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl && (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0) && wlv.filler_todo <= 0 && wlv.draw_state > WL_STC - && c != NUL) { - c = wp->w_p_lcs_chars.prec; + && mb_c != NUL) { + mb_c = wp->w_p_lcs_chars.prec; lcs_prec_todo = NUL; if (utf_char2cells(mb_c) > 1) { // Double-width character being overwritten by the "precedes" @@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl wlv.n_attr = 2; wlv.extra_attr = win_hl_attr(wp, HLF_AT); } - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); saved_attr3 = wlv.char_attr; // save current attr wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr n_attr3 = 1; } // At end of the text line or just after the last character. - if (c == NUL && eol_hl_off == 0) { + if (mb_c == NUL && eol_hl_off == 0) { // flag to indicate whether prevcol equals startcol of search_hl or // one of the matches bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl, @@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl } // At end of the text line. - if (c == NUL) { + if (mb_c == NUL) { // Highlight 'cursorcolumn' & 'colorcolumn' past end of the line. if (wp->w_p_wrap) { v = wlv.startrow == 0 ? wp->w_skipcol : 0; @@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl || lcs_eol_one > 0 || (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL)) || has_more_inline_virt(&wlv, v)) { - c = wp->w_p_lcs_chars.ext; + mb_c = wp->w_p_lcs_chars.ext; wlv.char_attr = win_hl_attr(wp, HLF_AT); - mb_c = c; - mb_utf8 = check_mb_utf8(&c, u8cc); + mb_schar = schar_from_char(mb_c); } } @@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl // Skip characters that are left of the screen for 'nowrap'. if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) { // Store the character. - if (mb_utf8) { - linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc); - } else { - linebuf_char[wlv.off] = schar_from_ascii((char)c); - } + linebuf_char[wlv.off] = mb_schar; if (multi_attr) { linebuf_attr[wlv.off] = multi_attr; multi_attr = 0; diff --git a/src/nvim/edit.c b/src/nvim/edit.c index ce547b55fe..eb5ea2c873 100644 --- a/src/nvim/edit.c +++ b/src/nvim/edit.c @@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight) pc_status = PC_STATUS_SET; } - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr); grid_line_flush(); } @@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent) int cc; if ((cc = utf_char2len(c)) > 1) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; utf_char2bytes(c, buf); buf[cc] = NUL; @@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) int cc; int temp = 0; // init for GCC bool did_backspace = false; - int cpc[MAX_MCO]; // composing characters bool call_fix_indent = false; // can't delete anything in an empty file @@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p) if (State & REPLACE_FLAG) { replace_do_bs(-1); } else { - const int l_p_deco = p_deco; - if (l_p_deco) { - (void)utfc_ptr2char(get_cursor_pos_ptr(), cpc); + bool has_composing = false; + if (p_deco) { + char *p0 = get_cursor_pos_ptr(); + has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0)); } (void)del_char(false); // If there are combining characters and 'delcombine' is set - // move the cursor back. Don't back up before the base - // character. - if (l_p_deco && cpc[0] != NUL) { + // move the cursor back. Don't back up before the base character. + if (has_composing) { inc_cursor(); } if (revins_chars) { diff --git a/src/nvim/eval.c b/src/nvim/eval.c index ed70091077..c073f30547 100644 --- a/src/nvim/eval.c +++ b/src/nvim/eval.c @@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE /// Set v:char to character "c". void set_vim_var_char(int c) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; buf[utf_char2bytes(c, buf)] = NUL; set_vim_var_string(VV_CHAR, buf, -1); diff --git a/src/nvim/eval/funcs.c b/src/nvim/eval/funcs.c index c6909245af..8ef208f291 100644 --- a/src/nvim/eval/funcs.c +++ b/src/nvim/eval/funcs.c @@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) return; } - char buf[MB_MAXBYTES]; + char buf[MB_MAXCHAR]; const int len = utf_char2bytes((int)num, buf); rettv->v_type = VAR_STRING; @@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { c = -1; } else { - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); c = utf_ptr2char(buf); } @@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) ScreenGrid *grid; screenchar_adjust(&grid, &row, &col); + tv_list_alloc_ret(rettv, kListLenMayKnow); if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) { - tv_list_alloc_ret(rettv, 0); return; } - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); - int pcc[MAX_MCO]; - int c = utfc_ptr2char(buf, pcc); - int composing_len = 0; - while (composing_len < MAX_MCO && pcc[composing_len] != 0) { - composing_len++; - } - tv_list_alloc_ret(rettv, composing_len + 1); - tv_list_append_number(rettv->vval.v_list, c); - for (int i = 0; i < composing_len; i++) { - tv_list_append_number(rettv->vval.v_list, pcc[i]); - } + + // schar values are already processed chars which are always NUL-terminated. + // A single [0] is expected when char is NUL. + size_t i = 0; + do { + int c = utf_ptr2char(buf + i); + tv_list_append_number(rettv->vval.v_list, c); + i += (size_t)utf_ptr2len(buf + i); + } while (buf[i] != NUL); } /// "screencol()" function @@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr return; } - char buf[MB_MAXBYTES + 1]; + char buf[MAX_SCHAR_SIZE + 1]; schar_get(buf, grid_getchar(grid, row, col, NULL)); rettv->vval.v_string = xstrdup(buf); } @@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt char *const csearch = tv_dict_get_string(d, "char", false); if (csearch != NULL) { - int pcc[MAX_MCO]; - const int c = utfc_ptr2char(csearch, pcc); + int c = utf_ptr2char(csearch); set_last_csearch(c, csearch, utfc_ptr2len(csearch)); } diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c index 692b320335..d92be6404b 100644 --- a/src/nvim/ex_cmds.c +++ b/src/nvim/ex_cmds.c @@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[] /// ":ascii" and "ga" implementation void do_ascii(exarg_T *eap) { - char *dig; - int cc[MAX_MCO]; - int c = utfc_ptr2char(get_cursor_pos_ptr(), cc); - if (c == NUL) { + char *data = get_cursor_pos_ptr(); + size_t len = (size_t)utfc_ptr2len(data); + + if (len == 0) { msg("NUL", 0); return; } - size_t iobuff_len = 0; + bool need_clear = true; + msg_sb_eol(); + msg_start(); + + int c = utf_ptr2char(data); + size_t off = 0; - int ci = 0; + // TODO(bfredl): merge this with the main loop if (c < 0x80) { if (c == NL) { // NUL is stored as NL. c = NUL; @@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap) char buf2[20]; buf2[0] = NUL; - dig = get_digraph_for_char(cval); + char *dig = get_digraph_for_char(cval); if (dig != NULL) { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"), - transchar(c), buf1, buf2, cval, cval, cval, dig); + vim_snprintf(IObuff, sizeof(IObuff), + _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"), + transchar(c), buf1, buf2, cval, cval, cval, dig); } else { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - _("<%s>%s%s %d, Hex %02x, Octal %03o"), - transchar(c), buf1, buf2, cval, cval, cval); - } - - c = cc[ci++]; - } - -#define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1) - // Space for description: - // - 1 byte for separator (starting from second entry) - // - 1 byte for "<" - // - 1 byte for space to draw composing character on (optional, but really - // mostly required) - // - up to MB_MAXBYTES bytes for character itself - // - 16 bytes for raw text ("> , Hex , Octal "). - // - at least 4 bytes for hexadecimal representation - // - at least 3 bytes for decimal representation - // - at least 3 bytes for octal representation - // - 1 byte for NUL - // - // Taking into account MAX_MCO and characters which need 8 bytes for - // hexadecimal representation, but not taking translation into account: - // resulting string will occupy less then 400 bytes (conservative estimate). - // - // Less then 1000 bytes if translation multiplies number of bytes needed for - // raw text by 6, so it should always fit into 1025 bytes reserved for IObuff. + vim_snprintf(IObuff, sizeof(IObuff), + _("<%s>%s%s %d, Hex %02x, Octal %03o"), + transchar(c), buf1, buf2, cval, cval, cval); + } + + msg_multiline(IObuff, 0, true, &need_clear); + + off += (size_t)utf_ptr2len(data); // needed for overlong ascii? + } // Repeat for combining characters, also handle multiby here. - while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) { + while (off < len) { + c = utf_ptr2char(data + off); + + size_t iobuff_len = 0; // This assumes every multi-byte char is printable... - if (iobuff_len > 0) { + if (off > 0) { IObuff[iobuff_len++] = ' '; } IObuff[iobuff_len++] = '<'; @@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap) } iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len); - dig = get_digraph_for_char(c); + char *dig = get_digraph_for_char(c); if (dig != NULL) { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - (c < 0x10000 - ? _("> %d, Hex %04x, Oct %o, Digr %s") - : _("> %d, Hex %08x, Oct %o, Digr %s")), - c, c, c, dig); + vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len, + (c < 0x10000 + ? _("> %d, Hex %04x, Oct %o, Digr %s") + : _("> %d, Hex %08x, Oct %o, Digr %s")), + c, c, c, dig); } else { - iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len, - sizeof(IObuff) - iobuff_len, - (c < 0x10000 - ? _("> %d, Hex %04x, Octal %o") - : _("> %d, Hex %08x, Octal %o")), - c, c, c); - } - if (ci == MAX_MCO) { - break; + vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len, + (c < 0x10000 + ? _("> %d, Hex %04x, Octal %o") + : _("> %d, Hex %08x, Octal %o")), + c, c, c); } - c = cc[ci++]; - } - if (ci != MAX_MCO && c != 0) { - xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len); + + msg_multiline(IObuff, 0, true, &need_clear); + + off += (size_t)utf_ptr2len(data + off); // needed for overlong ascii? } - msg(IObuff, 0); + if (need_clear) { + msg_clr_eos(); + } + msg_end(); } /// ":left", ":center" and ":right": align text. diff --git a/src/nvim/grid.c b/src/nvim/grid.c index f21b7e3a90..6320abe4ea 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off) } } -/// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell. -schar_T schar_from_cc(int c, int u8cc[MAX_MCO]) -{ - char buf[MAX_SCHAR_SIZE]; - int len = utf_char2bytes(c, buf); - for (int i = 0; i < MAX_MCO; i++) { - if (u8cc[i] == 0) { - break; - } - len += utf_char2bytes(u8cc[i], buf + len); - } - buf[len] = 0; - return schar_from_buf(buf, (size_t)len); -} - schar_T schar_from_str(char *str) { if (str == NULL) { @@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols) schar_get(scbuf, buf[i]); char scbuf_new[MAX_SCHAR_SIZE]; - int len = utf_char2bytes(c0new, scbuf_new); + size_t len = (size_t)utf_char2bytes(c0new, scbuf_new); if (c1new) { - len += utf_char2bytes(c1new, scbuf_new + len); + len += (size_t)utf_char2bytes(c1new, scbuf_new + len); } int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0); size_t rest = strlen(scbuf + off); - if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) { - // TODO(bfredl): this cannot happen just yet, as we only construct - // schar_T values with up to MAX_MCO+1 composing codepoints. When code - // is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit, - // we need be able to peel off a composing char which doesn't fit anymore. - abort(); + if (rest + len + 1 > MAX_SCHAR_SIZE) { + // Too bigly, discard one code-point. + // This should be enough as c0 cannot grow more than from 2 to 4 bytes + // (base arabic to extended arabic) + rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1; } memcpy(scbuf_new + len, scbuf + off, rest); - buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest); + buf[i] = schar_from_buf(scbuf_new, len + rest); next: c0prev = c0; @@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row) return grid->attrs[grid->line_offset[row]] < 0; } -/// Get a single character directly from grid.chars into "bytes", which must -/// have a size of "MB_MAXBYTES + 1". -/// If "attrp" is not NULL, return the character's attribute in "*attrp". +/// Get a single character directly from grid.chars +/// +/// @param[out] attrp set to the character's attribute (optional) schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp) { grid_adjust(&grid, &row, &col); @@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) { const char *ptr = text; int len = textlen; - int u8cc[MAX_MCO]; assert(grid_line_grid); int start_col = col; int max_col = grid_line_maxcol; - while (col < max_col - && (len < 0 || (int)(ptr - text) < len) - && *ptr != NUL) { + while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { // check if this is the first byte of a multibyte int mbyte_blen = len > 0 ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) : utfc_ptr2len(ptr); - int u8c = len >= 0 - ? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr)) - : utfc_ptr2char(ptr, u8cc); - int mbyte_cells = utf_char2cells(u8c); + int firstc; + schar_T schar = len >= 0 + ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc) + : utfc_ptr2schar(ptr, &firstc); + int mbyte_cells = utf_char2cells(firstc); if (mbyte_cells > 2) { mbyte_cells = 1; - u8c = 0xFFFD; - u8cc[0] = 0; + + schar = schar_from_char(0xFFFD); } if (col + mbyte_cells > max_col) { // Only 1 cell left, but character requires 2 cells: // display a '>' in the last column to avoid wrapping. */ - u8c = '>'; - u8cc[0] = 0; + schar = schar_from_ascii('>'); mbyte_cells = 1; } - schar_T buf; - // TODO(bfredl): why not just keep the original byte sequence. - buf = schar_from_cc(u8c, u8cc); - // When at the start of the text and overwriting the right half of a // two-cell character in the same grid, truncate that into a '>'. if (ptr == text && col > grid_line_first && col < grid_line_last @@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) linebuf_char[col - 1] = schar_from_ascii('>'); } - linebuf_char[col] = buf; + linebuf_char[col] = schar; linebuf_attr[col] = attr; linebuf_vcol[col] = -1; if (mbyte_cells == 2) { diff --git a/src/nvim/grid_defs.h b/src/nvim/grid_defs.h index 11e736fc0c..3cc2d788d3 100644 --- a/src/nvim/grid_defs.h +++ b/src/nvim/grid_defs.h @@ -7,8 +7,8 @@ #include "nvim/pos.h" #include "nvim/types.h" -#define MAX_MCO 6 // fixed value for 'maxcombine' -// Includes final NUL. at least 4*(MAX_MCO+1)+1 +// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29 +// ensures we can fit all composed chars which did fit before. #define MAX_SCHAR_SIZE 32 // if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness) @@ -35,7 +35,7 @@ enum { /// we can avoid sending bigger updates than necessary to the Ul layer. /// /// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can -/// contain up to MAX_MCO composing characters after the base character. +/// contain composing characters as many as fits in MAX_SCHAR_SIZE-1 bytes /// The composing characters are to be drawn on top of the original character. /// The content after the NUL is not defined (so comparison must be done a /// single cell at a time). Double-width characters are stored in the left cell, diff --git a/src/nvim/insexpand.c b/src/nvim/insexpand.c index f565d5b9e8..adbd2a5315 100644 --- a/src/nvim/insexpand.c +++ b/src/nvim/insexpand.c @@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c) return; } if ((cc = utf_char2len(c)) > 1) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; utf_char2bytes(c, buf); buf[cc] = NUL; diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c index 5072d14c0e..a200b0a32f 100644 --- a/src/nvim/lua/stdlib.c +++ b/src/nvim/lua/stdlib.c @@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL if (offset < 0 || offset > (intptr_t)s1_len) { return luaL_error(lstate, "index out of range"); } - int head_offset = utf_cp_head_off(s1, s1 + offset - 1); + int head_offset = -utf_cp_head_off(s1, s1 + offset - 1); lua_pushinteger(lstate, head_offset); return 1; } diff --git a/src/nvim/match.c b/src/nvim/match.c index 3420455e5f..0cd0426cff 100644 --- a/src/nvim/match.c +++ b/src/nvim/match.c @@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id); if (cur->mit_conceal_char) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL; tv_dict_add_str(dict, S_LEN("conceal"), buf); diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 0d468889a4..3a13aeddb8 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -48,6 +48,7 @@ #include "nvim/getchar.h" #include "nvim/gettext.h" #include "nvim/globals.h" +#include "nvim/grid.h" #include "nvim/grid_defs.h" #include "nvim/iconv.h" #include "nvim/keycodes.h" @@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2) return arabic_combine(utf_ptr2char(p1), c2); } -/// Convert a UTF-8 string to a wide character +/// Get the screen char at the beginning of a string /// -/// Also gets up to #MAX_MCO composing characters. +/// Caller is expected to check for things like unprintable chars etc +/// If first char in string is a composing char, prepend a space to display it correctly. /// -/// @param[out] pcc Location where to store composing characters. Must have -/// space at least for #MAX_MCO + 1 elements. +/// If "p" starts with an invalid sequence, zero is returned. /// -/// @return leading character. -int utfc_ptr2char(const char *p, int *pcc) +/// @param[out] firstc (required) The first codepoint of the screen char, +/// or the first byte of an invalid sequence +/// +/// @return the char +schar_T utfc_ptr2schar(const char *p, int *firstc) + FUNC_ATTR_NONNULL_ALL { - int i = 0; - int c = utf_ptr2char(p); - int len = utf_ptr2len(p); - - // Only accept a composing char when the first char isn't illegal. - if ((len > 1 || (uint8_t)(*p) < 0x80) - && (uint8_t)p[len] >= 0x80 - && utf_composinglike(p, p + len)) { - int cc = utf_ptr2char(p + len); - while (true) { - pcc[i++] = cc; - if (i == MAX_MCO) { - break; - } - len += utf_ptr2len(p + len); - if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) { - break; - } - } - } + *firstc = c; // NOT optional, you are gonna need it + bool first_compose = utf_iscomposing(c); + size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); - if (i < MAX_MCO) { // last composing char must be 0 - pcc[i] = 0; + if (len == 1 && (uint8_t)(*p) >= 0x80) { + return 0; // invalid sequence } - return c; + return schar_from_buf_first(p, len, first_compose); } -// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO -// composing characters. Use no more than p[maxlen]. -// -// @param [out] pcc: composing chars, last one is 0 -int utfc_ptr2char_len(const char *p, int *pcc, int maxlen) +/// Get the screen char at the beginning of a string with length +/// +/// Like utfc_ptr2schar but use no more than p[maxlen]. +schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) + FUNC_ATTR_NONNULL_ALL { assert(maxlen > 0); - int i = 0; + size_t len = (size_t)utf_ptr2len_len(p, maxlen); + if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + // invalid or truncated sequence + *firstc = (uint8_t)(*p); + return 0; + } - int len = utf_ptr2len_len(p, maxlen); - // Is it safe to use utf_ptr2char()? - bool safe = len > 1 && len <= maxlen; - int c = safe ? utf_ptr2char(p) : (uint8_t)(*p); + int c = utf_ptr2char(p); + *firstc = c; + bool first_compose = utf_iscomposing(c); + maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); + len = (size_t)utfc_ptr2len_len(p, maxlen); - // Only accept a composing char when the first char isn't illegal. - if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) { - for (; i < MAX_MCO; i++) { - int len_cc = utf_ptr2len_len(p + len, maxlen - len); - safe = len_cc > 1 && len_cc <= maxlen - len; - if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 - || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) { - break; - } - len += len_cc; - } - } + return schar_from_buf_first(p, len, first_compose); +} - if (i < MAX_MCO) { - // last composing char must be 0 - pcc[i] = 0; +/// Caller must ensure there is space for `first_compose` +static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose) +{ + if (first_compose) { + char cbuf[MAX_SCHAR_SIZE]; + cbuf[0] = ' '; + memcpy(cbuf + 1, buf, len); + return schar_from_buf(cbuf, len + 1); + } else { + return schar_from_buf(buf, len); } - - return c; -#undef ISCOMPOSING } /// Get the length of a UTF-8 byte sequence representing a single codepoint @@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p) return 1; } - // Check for composing characters. We can handle only the first six, but - // skip all of them (otherwise the cursor would get stuck). + // Check for composing characters. int prevlen = 0; while (true) { if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { @@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in) /// Return the offset from "p" to the first byte of the codepoint it points /// to. Can start anywhere in a stream of bytes. /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately and returns a negative offset. +/// separately. /// /// @param[in] base Pointer to start of string /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint // -/// @return 0 if invalid sequence, else offset to previous codepoint +/// @return 0 if invalid sequence, else number of bytes to previous codepoint int utf_cp_head_off(const char *base, const char *p) { int i; @@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p) } // Find the first character that is not 10xx.xxxx - for (i = 0; p - i > base; i--) { - if (((uint8_t)p[i] & 0xc0) != 0x80) { + for (i = 0; p - i >= base; i++) { + if (((uint8_t)p[-i] & 0xc0) != 0x80) { break; } } - // Find the last character that is 10xx.xxxx - for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {} + // Find the last character that is 10xx.xxxx (condition terminates on NUL) + int j = 1; + while (((uint8_t)p[j] & 0xc0) == 0x80) { + j++; + } // Check for illegal sequence. - if (utf8len_tab[(uint8_t)p[i]] == 1) { + if (utf8len_tab[(uint8_t)p[-i]] != j + i) { return 0; } return i; diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 1d1a9439ad..c177f14ce2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -7,6 +7,7 @@ #include "nvim/cmdexpand_defs.h" #include "nvim/eval/typval_defs.h" #include "nvim/func_attr.h" +#include "nvim/grid_defs.h" #include "nvim/mbyte_defs.h" #include "nvim/os/os_defs.h" #include "nvim/types.h" diff --git a/src/nvim/message.c b/src/nvim/message.c index ee1a9e60b0..9e9aa1fcd6 100644 --- a/src/nvim/message.c +++ b/src/nvim/message.c @@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0; static void ui_ext_msg_set_pos(int row, bool scrolled) { - char buf[MAX_MCO + 1]; + char buf[MB_MAXCHAR + 1]; size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf); buf[size] = '\0'; ui_call_msg_set_pos(msg_grid.handle, row, scrolled, @@ -1471,7 +1471,7 @@ void msg_putchar(int c) void msg_putchar_attr(int c, int attr) { - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; if (IS_SPECIAL(c)) { buf[0] = (char)K_SPECIAL; @@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr) mode_displayed = false; } - // If the string starts with a composing character first draw a space on - // which the composing char can be drawn. - if (utf_iscomposing(utf_ptr2char(msgstr))) { - msg_puts_attr(" ", attr); - } - // Go over the string. Special characters are translated and printed. // Normal characters are printed several at a time. while (--len >= 0 && !got_int) { diff --git a/src/nvim/option_vars.h b/src/nvim/option_vars.h index f0c752a2b1..0193e43de7 100644 --- a/src/nvim/option_vars.h +++ b/src/nvim/option_vars.h @@ -556,6 +556,7 @@ EXTERN char *p_mp; ///< 'makeprg' EXTERN char *p_mps; ///< 'matchpairs' EXTERN OptInt p_mat; ///< 'matchtime' EXTERN OptInt p_mco; ///< 'maxcombine' +#define MAX_MCO 6 // fixed value for 'maxcombine' EXTERN OptInt p_mfd; ///< 'maxfuncdepth' EXTERN OptInt p_mmd; ///< 'maxmapdepth' EXTERN OptInt p_mmp; ///< 'maxmempattern' diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c index 15a58a4434..dab278e383 100644 --- a/src/nvim/spellsuggest.c +++ b/src/nvim/spellsuggest.c @@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word) static bool similar_chars(slang_T *slang, int c1, int c2) { int m1, m2; - char buf[MB_MAXBYTES + 1]; + char buf[MB_MAXCHAR + 1]; hashitem_T *hi; if (c1 >= 256) { diff --git a/test/functional/ui/fold_spec.lua b/test/functional/ui/fold_spec.lua index 9a0182ea29..1addf7088e 100644 --- a/test/functional/ui/fold_spec.lua +++ b/test/functional/ui/fold_spec.lua @@ -1102,8 +1102,6 @@ describe("folded lines", function() end) it("works with multibyte text", function() - -- Currently the only allowed value of 'maxcombine' - eq(6, meths.get_option_value('maxcombine', {})) eq(true, meths.get_option_value('arabicshape', {})) insert([[ å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة @@ -1120,7 +1118,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | + å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | möre tex^t | {1:~ }| {1:~ }| @@ -1132,7 +1130,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | + å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ | möre tex^t | {1:~ }| {1:~ }| @@ -1156,7 +1154,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1168,7 +1166,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1192,7 +1190,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1204,7 +1202,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}| + {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}| {1:~ }| {1:~ }| {1:~ }| @@ -1228,7 +1226,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}| + {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}| {1:~ }| {1:~ }| {1:~ }| @@ -1240,7 +1238,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}| + {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}| {1:~ }| {1:~ }| {1:~ }| @@ -1265,7 +1263,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}| + {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}| {1: ~}| {1: ~}| {1: ~}| @@ -1277,7 +1275,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}| + {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}| {1: ~}| {1: ~}| {1: ~}| @@ -1301,7 +1299,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1313,7 +1311,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1337,7 +1335,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1349,7 +1347,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}| + {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}| {1: ~}| {1: ~}| {1: ~}| @@ -1373,7 +1371,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å| + ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1385,7 +1383,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å| + ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1409,7 +1407,7 @@ describe("folded lines", function() [2:---------------------------------------------]| [3:---------------------------------------------]| ## grid 2 - ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å| + ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| @@ -1421,7 +1419,7 @@ describe("folded lines", function() ]]) else screen:expect([[ - ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å| + ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å| txet eröm| {1: ~}| {1: ~}| diff --git a/test/functional/ui/multibyte_spec.lua b/test/functional/ui/multibyte_spec.lua index 077dd1a779..d72bf27d6b 100644 --- a/test/functional/ui/multibyte_spec.lua +++ b/test/functional/ui/multibyte_spec.lua @@ -228,6 +228,36 @@ describe("multibyte rendering", function() ]]} end) + + it('works with arabicshape and multiple composing chars', function() + -- this tests an important edge case: arabicshape might increase the byte size of the base + -- character in a way so that the last composing char no longer fits. use "g8" on the text + -- to observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape') + -- If we would increase the schar_t size, say from 32 to 64 bytes, we need to extend the + -- test text with even more zalgo energy to still touch this edge case. + + meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"}) + command('set noarabicshape') + + screen:expect{grid=[[ + ^سلام့̀́̂̃̄̅̆̇̈̉̊̋̌ | + {1:~ }| + {1:~ }| + {1:~ }| + {1:~ }| + | + ]]} + + command('set arabicshape') + screen:expect{grid=[[ + ^ﺱﻼﻣ̀́̂̃̄̅̆̇̈̉̊̋̌ | + {1:~ }| + {1:~ }| + {1:~ }| + {1:~ }| + | + ]]} + end) end) describe('multibyte rendering: statusline', function() diff --git a/test/functional/ui/output_spec.lua b/test/functional/ui/output_spec.lua index 0dd1f0325c..7b93b74eac 100644 --- a/test/functional/ui/output_spec.lua +++ b/test/functional/ui/output_spec.lua @@ -225,8 +225,8 @@ describe("shell command :!", function() å | ref: å̲ | 1: å̲ | - 2: å ̲ | - 3: å ̲ | + 2: å ̲ | + 3: å ̲ | | {3:Press ENTER or type command to continue}^ | ]]) diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua index fdb1bceab0..cd94624570 100644 --- a/test/unit/mbyte_spec.lua +++ b/test/unit/mbyte_spec.lua @@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it) local ffi = helpers.ffi local eq = helpers.eq -local mbyte = helpers.cimport("./src/nvim/mbyte.h") -local charset = helpers.cimport('./src/nvim/charset.h') +local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h') describe('mbyte', function() - -- Array for composing characters - local intp = ffi.typeof('int[?]') - local function to_intp() - -- how to get MAX_MCO from globals.h? - return intp(7, 1) - end - -- Convert from bytes to string local function to_string(bytes) local s = {} @@ -30,14 +22,14 @@ describe('mbyte', function() itp('utf_ptr2char', function() -- For strings with length 1 the first byte is returned. for c = 0, 255 do - eq(c, mbyte.utf_ptr2char(to_string({c, 0}))) + eq(c, lib.utf_ptr2char(to_string({c, 0}))) end -- Some ill formed byte sequences that should not be recognized as UTF-8 -- First byte: 0xc0 or 0xc1 -- Second byte: 0x80 .. 0xbf - --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80}))) - --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf}))) + --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80}))) + --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf}))) -- -- Sequences with more than four bytes end) @@ -47,240 +39,133 @@ describe('mbyte', function() local char_p = ffi.typeof('char[?]') for c = n * 0x1000, n * 0x1000 + 0xFFF do local p = char_p(4, 0) - mbyte.utf_char2bytes(c, p) - eq(c, mbyte.utf_ptr2char(p)) - eq(charset.vim_iswordc(c), charset.vim_iswordp(p)) + lib.utf_char2bytes(c, p) + eq(c, lib.utf_ptr2char(p)) + eq(lib.vim_iswordc(c), lib.vim_iswordp(p)) end end) end - describe('utfc_ptr2char_len', function() + describe('utfc_ptr2schar_len', function() + local function test_seq(seq) + local firstc = ffi.new("int[1]") + local buf = ffi.new("char[32]") + lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc)) + return {ffi.string(buf), firstc[0]} + end + + local function byte(val) + return {string.char(val), val} + end itp('1-byte sequences', function() - local pcc = to_intp() - for c = 0, 255 do - eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1)) - eq(0, pcc[0]) + eq({'', 0}, test_seq{0}) + for c = 1, 127 do + eq(byte(c), test_seq{c}) + end + for c = 128, 255 do + eq({'', c}, test_seq{c}) end end) itp('2-byte sequences', function() - local pcc = to_intp() -- No combining characters - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f}) -- No combining characters - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2)) - eq(0, pcc[0]) + eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0xc0}) end) itp('3-byte sequences', function() - local pcc = to_intp() - -- No second UTF-8 character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80}) -- No combining character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80}) -- Combining character is U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc}) -- Incomplete combining character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc}) - -- One UTF-8 character - pcc = to_intp() - eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3)) - eq(0, pcc[0]) + -- One UTF-8 character (composing only) + eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90}) end) itp('4-byte sequences', function() - local pcc = to_intp() -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80}) end) itp('5+-byte sequences', function() - local pcc = to_intp() - -- No following combining character - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80}) -- No second UTF-8 character - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80}) -- Combining character U+0300 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00}) -- Combining characters U+0300 and U+0301 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81}) -- Combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}) -- Combining characters U+0300, U+0301, U+0302, U+0303 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0000, pcc[4]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}) -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0000, pcc[5]) - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(1, pcc[6]) - - -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, - -- U+0305, U+0306, but only save six (= MAX_MCO). - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0303, pcc[3]) - eq(0x0304, pcc[4]) - eq(0x0305, pcc[5]) - eq(0x0001, pcc[6]) + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}) - -- Only three following combining characters U+0300, U+0301, U+0302 - pcc = to_intp() - eq(0x007f, mbyte.utfc_ptr2char_len(to_string( - {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13)) - eq(0x0300, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0302, pcc[2]) - eq(0x0000, pcc[3]) + -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}) + -- Only three following combining characters U+0300, U+0301, U+0302 + eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}) -- No UTF-8 sequence - pcc = to_intp() - eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80}) -- No following UTF-8 character - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f}) -- Combining character U+0301 - pcc = to_intp() - eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5)) - eq(0x0301, pcc[0]) - eq(0x0000, pcc[1]) + eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80}) -- One UTF-8 character - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5)) - eq(0, pcc[0]) + eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc}) -- Combining characters U+1AB0 and U+0301 - pcc = to_intp() - eq(0x100000, mbyte.utfc_ptr2char_len(to_string( - {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9)) - eq(0x1ab0, pcc[0]) - eq(0x0301, pcc[1]) - eq(0x0000, pcc[2]) + eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}) end) end) |