aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--runtime/doc/mbyte.txt3
-rw-r--r--runtime/doc/news.txt7
-rw-r--r--runtime/doc/vim_diff.txt13
-rw-r--r--runtime/lua/vim/_meta/options.lua2
-rw-r--r--src/nvim/change.c11
-rw-r--r--src/nvim/charset.c19
-rw-r--r--src/nvim/digraph.c2
-rw-r--r--src/nvim/drawline.c261
-rw-r--r--src/nvim/edit.c17
-rw-r--r--src/nvim/eval.c2
-rw-r--r--src/nvim/eval/funcs.c33
-rw-r--r--src/nvim/ex_cmds.c110
-rw-r--r--src/nvim/grid.c65
-rw-r--r--src/nvim/grid_defs.h6
-rw-r--r--src/nvim/insexpand.c2
-rw-r--r--src/nvim/lua/stdlib.c2
-rw-r--r--src/nvim/match.c2
-rw-r--r--src/nvim/mbyte.c123
-rw-r--r--src/nvim/mbyte.h1
-rw-r--r--src/nvim/message.c10
-rw-r--r--src/nvim/option_vars.h1
-rw-r--r--src/nvim/spellsuggest.c2
-rw-r--r--test/functional/ui/fold_spec.lua38
-rw-r--r--test/functional/ui/multibyte_spec.lua30
-rw-r--r--test/functional/ui/output_spec.lua4
-rw-r--r--test/unit/mbyte_spec.lua243
26 files changed, 403 insertions, 606 deletions
diff --git a/runtime/doc/mbyte.txt b/runtime/doc/mbyte.txt
index aedef87a09..0a7e0baad3 100644
--- a/runtime/doc/mbyte.txt
+++ b/runtime/doc/mbyte.txt
@@ -646,7 +646,8 @@ widespread as file format.
A composing or combining character is used to change the meaning of the
character before it. The combining characters are drawn on top of the
preceding character.
-Up to six combining characters can be displayed.
+Too big combined characters cannot be displayed, but they can still be
+inspected using the |g8| and |ga| commands described below.
When editing text a composing character is mostly considered part of the
preceding character. For example "x" will delete a character and its
following composing characters by default.
diff --git a/runtime/doc/news.txt b/runtime/doc/news.txt
index 2f48ebfeff..cb3220a630 100644
--- a/runtime/doc/news.txt
+++ b/runtime/doc/news.txt
@@ -294,6 +294,13 @@ The following changes to existing APIs or features add new behavior.
Note that syntax highlighting of code examples requires a matching parser
and may be affected by custom queries.
+• Support for rendering multibyte characters using composing characters has been
+ enhanced. The maximum limit have been increased from 1+6 codepoints to
+ 31 bytes, which is guaranteed to fit all chars from before but often more.
+
+ NOTE: the regexp engine still has a hard-coded limit of considering
+ 6 composing chars only.
+
==============================================================================
REMOVED FEATURES *news-removed*
diff --git a/runtime/doc/vim_diff.txt b/runtime/doc/vim_diff.txt
index 05d7e5feb9..5e09cc2481 100644
--- a/runtime/doc/vim_diff.txt
+++ b/runtime/doc/vim_diff.txt
@@ -722,9 +722,16 @@ Options:
<
*'macatsui'*
*'maxcombine'* *'mco'*
- Nvim always displays up to 6 combining characters. You can still edit
- text with more than 6 combining characters, you just can't see them.
- Use |g8| or |ga|. See |mbyte-combining|.
+ Nvim counts maximum character sizes in bytes, not codepoints. This is
+ guaranteed to be big enough to always fit all chars properly displayed
+ in vim with 'maxcombine' set to 6.
+
+ You can still edit text with larger characters than fits in the screen buffer,
+ you just can't see them. Use |g8| or |ga|. See |mbyte-combining|.
+
+ NOTE: the rexexp engine still has a hard-coded limit of considering
+ 6 composing chars only.
+
*'maxmem'* Nvim delegates memory-management to the OS.
*'maxmemtot'* Nvim delegates memory-management to the OS.
printoptions
diff --git a/runtime/lua/vim/_meta/options.lua b/runtime/lua/vim/_meta/options.lua
index 19ae786177..be4a4dd49c 100644
--- a/runtime/lua/vim/_meta/options.lua
+++ b/runtime/lua/vim/_meta/options.lua
@@ -2576,7 +2576,7 @@ vim.go.fp = vim.go.formatprg
--- security reasons.
---
--- @type boolean
-vim.o.fsync = false
+vim.o.fsync = true
vim.o.fs = vim.o.fsync
vim.go.fsync = vim.o.fsync
vim.go.fs = vim.go.fsync
diff --git a/src/nvim/change.c b/src/nvim/change.c
index 58718811bc..aa58779f5b 100644
--- a/src/nvim/change.c
+++ b/src/nvim/change.c
@@ -665,7 +665,7 @@ void ins_bytes_len(char *p, size_t len)
/// convert bytes to a character.
void ins_char(int c)
{
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
size_t n = (size_t)utf_char2bytes(c, buf);
// When "c" is 0x100, 0x200, etc. we don't want to insert a NUL byte.
@@ -869,12 +869,9 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
// If 'delcombine' is set and deleting (less than) one character, only
// delete the last combining character.
- if (p_deco && use_delcombine
- && utfc_ptr2len(oldp + col) >= count) {
- int cc[MAX_MCO];
-
- (void)utfc_ptr2char(oldp + col, cc);
- if (cc[0] != NUL) {
+ if (p_deco && use_delcombine && utfc_ptr2len(oldp + col) >= count) {
+ char *p0 = oldp + col;
+ if (utf_composinglike(p0, p0 + utf_ptr2len(p0))) {
// Find the last composing char, there can be several.
int n = col;
do {
diff --git a/src/nvim/charset.c b/src/nvim/charset.c
index 0adcc09ec7..5dfc9c444d 100644
--- a/src/nvim/charset.c
+++ b/src/nvim/charset.c
@@ -302,15 +302,13 @@ size_t transstr_len(const char *const s, bool untab)
while (*p) {
const size_t l = (size_t)utfc_ptr2len(p);
if (l > 1) {
- int pcc[MAX_MCO + 1];
- pcc[0] = utfc_ptr2char(p, &pcc[1]);
-
- if (vim_isprintc(pcc[0])) {
+ if (vim_isprintc(utf_ptr2char(p))) {
len += l;
} else {
- for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
+ for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
+ int c = utf_ptr2char(p + off);
char hexbuf[9];
- len += transchar_hex(hexbuf, pcc[i]);
+ len += transchar_hex(hexbuf, c);
}
}
p += l;
@@ -349,16 +347,15 @@ size_t transstr_buf(const char *const s, const ssize_t slen, char *const buf, co
if (buf_p + l > buf_e) {
break; // Exceeded `buf` size.
}
- int pcc[MAX_MCO + 1];
- pcc[0] = utfc_ptr2char(p, &pcc[1]);
- if (vim_isprintc(pcc[0])) {
+ if (vim_isprintc(utf_ptr2char(p))) {
memmove(buf_p, p, l);
buf_p += l;
} else {
- for (size_t i = 0; i < ARRAY_SIZE(pcc) && pcc[i]; i++) {
+ for (size_t off = 0; off < l; off += (size_t)utf_ptr2len(p + off)) {
+ int c = utf_ptr2char(p + off);
char hexbuf[9]; // <up to 6 bytes>NUL
- const size_t hexlen = transchar_hex(hexbuf, pcc[i]);
+ const size_t hexlen = transchar_hex(hexbuf, c);
if (buf_p + hexlen > buf_e) {
break;
}
diff --git a/src/nvim/digraph.c b/src/nvim/digraph.c
index bc0ce99c5e..1bff78f90a 100644
--- a/src/nvim/digraph.c
+++ b/src/nvim/digraph.c
@@ -1654,7 +1654,7 @@ static void registerdigraph(int char1, int char2, int n)
bool check_digraph_chars_valid(int char1, int char2)
{
if (char2 == 0) {
- char msg[MB_MAXBYTES + 1];
+ char msg[MB_MAXCHAR + 1];
msg[utf_char2bytes(char1, msg)] = NUL;
semsg(_(e_digraph_must_be_just_two_characters_str), msg);
return false;
diff --git a/src/nvim/drawline.c b/src/nvim/drawline.c
index 11b4e55c5c..0cfab5cec9 100644
--- a/src/nvim/drawline.c
+++ b/src/nvim/drawline.c
@@ -228,14 +228,12 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
const char *p = *pp;
int cells = utf_ptr2cells(p);
int c_len = utfc_ptr2len(p);
- int u8c, u8cc[MAX_MCO];
assert(maxcells > 0);
if (cells > maxcells) {
dest[0] = schar_from_ascii(' ');
return 1;
}
- u8c = utfc_ptr2char(p, u8cc);
if (*p == TAB) {
cells = MIN(tabstop_padding(vcol, buf->b_p_ts, buf->b_p_vts_array), maxcells);
}
@@ -247,16 +245,14 @@ static int line_putchar(buf_T *buf, const char **pp, schar_T *dest, int maxcells
for (int c = 0; c < cells; c++) {
dest[c] = schar_from_ascii(' ');
}
- goto done;
- } else if ((uint8_t)(*p) < 0x80 && u8cc[0] == 0) {
- dest[0] = schar_from_ascii(*p);
} else {
- dest[0] = schar_from_cc(u8c, u8cc);
- }
- if (cells > 1) {
- dest[1] = 0;
+ int u8c;
+ dest[0] = utfc_ptr2schar(p, &u8c);
+ if (cells > 1) {
+ dest[1] = 0;
+ }
}
-done:
+
*pp += c_len;
return cells;
}
@@ -946,16 +942,6 @@ static void handle_inline_virtual_text(win_T *wp, winlinevars_T *wlv, ptrdiff_t
}
}
-static bool check_mb_utf8(int *c, int *u8cc)
-{
- if (utf_char2len(*c) > 1) {
- *u8cc = 0;
- *c = 0xc0;
- return true;
- }
- return false;
-}
-
static colnr_T get_trailcol(win_T *wp, const char *ptr, const char *line)
{
colnr_T trailcol = MAXCOL;
@@ -1051,7 +1037,6 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
{
winlinevars_T wlv; // variables passed between functions
- int c = 0; // init for GCC
colnr_T vcol_prev = -1; // "wlv.vcol" of previous character
char *line; // current line
char *ptr; // current position in "line"
@@ -1096,8 +1081,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
int multi_attr = 0; // attributes desired by multibyte
int mb_l = 1; // multi-byte byte length
int mb_c = 0; // decoded multi-byte character
- bool mb_utf8 = false; // screen char is UTF-8 char
- int u8cc[MAX_MCO]; // composing UTF-8 chars
+ schar_T mb_schar; // complete screen char
int change_start = MAXCOL; // first col of changed area
int change_end = -1; // last col of changed area
bool in_multispace = false; // in multiple consecutive spaces
@@ -1951,34 +1935,25 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// For the '$' of the 'list' option, n_extra == 1, p_extra == "".
if (wlv.n_extra > 0) {
if (wlv.c_extra != NUL || (wlv.n_extra == 1 && wlv.c_final != NUL)) {
- c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
- mb_c = c; // doesn't handle non-utf-8 multi-byte!
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_c = (wlv.n_extra == 1 && wlv.c_final != NUL) ? wlv.c_final : wlv.c_extra;
+ mb_schar = schar_from_char(mb_c);
+ wlv.n_extra--;
} else {
assert(wlv.p_extra != NULL);
- c = (uint8_t)(*wlv.p_extra);
- mb_c = c;
- // If the UTF-8 character is more than one byte:
- // Decode it into "mb_c".
mb_l = utfc_ptr2len(wlv.p_extra);
- mb_utf8 = false;
- if (mb_l > wlv.n_extra) {
- mb_l = 1;
- } else if (mb_l > 1) {
- mb_c = utfc_ptr2char(wlv.p_extra, u8cc);
- mb_utf8 = true;
- c = 0xc0;
- }
- if (mb_l == 0) { // at the NUL at end-of-line
+ mb_schar = utfc_ptr2schar(wlv.p_extra, &mb_c);
+ // mb_l=0 at the end-of-line NUL
+ if (mb_l > wlv.n_extra || mb_l == 0) {
mb_l = 1;
}
// If a double-width char doesn't fit display a '>' in the last column.
+ // Don't advance the pointer but put the character at the start of the next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
- c = '>';
- mb_c = c;
+ mb_c = '>';
mb_l = 1;
(void)mb_l;
+ mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT);
if (wlv.cul_attr) {
@@ -1986,18 +1961,11 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
? hl_combine_attr(wlv.cul_attr, multi_attr)
: hl_combine_attr(multi_attr, wlv.cul_attr);
}
-
- // put the pointer back to output the double-width
- // character at the start of the next line.
- wlv.n_extra++;
- wlv.p_extra--;
} else {
- wlv.n_extra -= mb_l - 1;
- wlv.p_extra += mb_l - 1;
+ wlv.n_extra -= mb_l;
+ wlv.p_extra += mb_l;
}
- wlv.p_extra++;
}
- wlv.n_extra--;
// Only restore search_attr and area_attr after "n_extra" in
// the next screen line is also done.
@@ -2026,58 +1994,40 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
} else if (has_fold) {
// skip writing the buffer line itself
- c = NUL;
+ mb_c = NUL;
} else {
- int c0;
char *prev_ptr = ptr;
- // Get a character from the line itself.
- c0 = c = (uint8_t)(*ptr);
- mb_c = c;
-
- if (c == NUL) {
+ // first byte of next char
+ int c0 = (uint8_t)(*ptr);
+ if (c0 == NUL) {
// no more cells to skip
wlv.skip_cells = 0;
}
- // If the UTF-8 character is more than one byte: Decode it
- // into "mb_c".
+ // Get a character from the line itself.
mb_l = utfc_ptr2len(ptr);
- mb_utf8 = false;
- if (mb_l > 1) {
- mb_c = utfc_ptr2char(ptr, u8cc);
- // Overlong encoded ASCII or ASCII with composing char
- // is displayed normally, except a NUL.
- if (mb_c < 0x80) {
- c0 = c = mb_c;
- }
- mb_utf8 = true;
-
- // At start of the line we can have a composing char.
- // Draw it as a space with a composing char.
- if (utf_iscomposing(mb_c)) {
- for (int i = MAX_MCO - 1; i > 0; i--) {
- u8cc[i] = u8cc[i - 1];
- }
- u8cc[0] = mb_c;
- mb_c = ' ';
- }
+ mb_schar = utfc_ptr2schar(ptr, &mb_c);
+
+ // Overlong encoded ASCII or ASCII with composing char
+ // is displayed normally, except a NUL.
+ if (mb_l > 1 && mb_c < 0x80) {
+ c0 = mb_c;
}
- if ((mb_l == 1 && c >= 0x80)
+ if ((mb_l == 1 && c0 >= 0x80)
|| (mb_l >= 1 && mb_c == 0)
|| (mb_l > 1 && (!vim_isprintc(mb_c)))) {
// Illegal UTF-8 byte: display as <xx>.
- // Non-BMP character : display as ? or fullwidth ?.
+ // Non-printable character : display as ? or fullwidth ?.
transchar_hex(wlv.extra, mb_c);
if (wp->w_p_rl) { // reverse
rl_mirror_ascii(wlv.extra, NULL);
}
wlv.p_extra = wlv.extra;
- c = (uint8_t)(*wlv.p_extra);
mb_c = mb_ptr2char_adv((const char **)&wlv.p_extra);
- mb_utf8 = (c >= 0x80);
+ mb_schar = schar_from_char(mb_c);
wlv.n_extra = (int)strlen(wlv.p_extra);
wlv.c_extra = NUL;
wlv.c_final = NUL;
@@ -2093,10 +2043,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// last column; the character is displayed at the start of the
// next line.
if (wlv.col >= grid->cols - 1 && utf_char2cells(mb_c) == 2) {
- c = '>';
- mb_c = c;
- mb_utf8 = false;
+ mb_c = '>';
mb_l = 1;
+ mb_schar = schar_from_ascii(mb_c);
multi_attr = win_hl_attr(wp, HLF_AT);
// Put pointer back so that the character will be
// displayed at the start of the next line.
@@ -2112,15 +2061,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 1;
wlv.c_extra = MB_FILLER_CHAR;
wlv.c_final = NUL;
- c = ' ';
+ mb_c = ' ';
+ mb_l = 1;
+ mb_schar = schar_from_ascii(mb_c);
if (area_attr == 0 && search_attr == 0) {
wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
saved_attr2 = wlv.char_attr; // save current attr
}
- mb_c = c;
- mb_utf8 = false;
- mb_l = 1;
}
ptr++;
@@ -2159,11 +2107,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// no concealing past the end of the line, it interferes
// with line highlighting.
- if (c == NUL) {
- syntax_flags = 0;
- } else {
- syntax_flags = get_syntax_info(&syntax_seqnr);
- }
+ syntax_flags = (mb_c == 0) ? 0 : get_syntax_info(&syntax_seqnr);
}
if (has_decor && v > 0) {
@@ -2198,7 +2142,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
spell_attr = 0;
// do not calculate cap_col at the end of the line or when
// only white space is following
- if (c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
+ if (mb_c != 0 && (*skipwhite(prev_ptr) != NUL) && can_spell) {
char *p;
hlf_T spell_hlf = HLF_COUNT;
v -= mb_l - 1;
@@ -2272,13 +2216,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
//
// So only allow to linebreak, once we have found chars not in
// 'breakat' in the line.
- if (wp->w_p_lbr && !wlv.need_lbr && c != NUL
+ if (wp->w_p_lbr && !wlv.need_lbr && mb_c != NUL
&& !vim_isbreak((uint8_t)(*ptr))) {
wlv.need_lbr = true;
}
// Found last space before word: check for line break.
- if (wp->w_p_lbr && c0 == c && wlv.need_lbr
- && vim_isbreak(c) && !vim_isbreak((uint8_t)(*ptr))) {
+ if (wp->w_p_lbr && c0 == mb_c && mb_c < 128 && wlv.need_lbr
+ && vim_isbreak(mb_c) && !vim_isbreak((uint8_t)(*ptr))) {
int mb_off = utf_head_off(line, ptr - 1);
char *p = ptr - (mb_off + 1);
chartabsize_T cts;
@@ -2289,33 +2233,33 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = win_lbr_chartabsize(&cts, NULL) - 1;
clear_chartabsize_arg(&cts);
- if (on_last_col && c != TAB) {
+ if (on_last_col && mb_c != TAB) {
// Do not continue search/match highlighting over the
// line break, but for TABs the highlighting should
// include the complete width of the character
search_attr = 0;
}
- if (c == TAB && wlv.n_extra + wlv.col > grid->cols) {
+ if (mb_c == TAB && wlv.n_extra + wlv.col > grid->cols) {
wlv.n_extra = tabstop_padding(wlv.vcol, wp->w_buffer->b_p_ts,
wp->w_buffer->b_p_vts_array) - 1;
}
wlv.c_extra = mb_off > 0 ? MB_FILLER_CHAR : ' ';
wlv.c_final = NUL;
- if (ascii_iswhite(c)) {
- if (c == TAB) {
+ if (mb_c < 128 && ascii_iswhite(mb_c)) {
+ if (mb_c == TAB) {
// See "Tab alignment" below.
FIX_FOR_BOGUSCOLS;
}
if (!wp->w_p_list) {
- c = ' ';
+ mb_c = ' ';
+ mb_schar = schar_from_ascii(mb_c);
}
}
}
if (wp->w_p_list) {
- in_multispace = c == ' ' && (*ptr == ' '
- || (prev_ptr > line && prev_ptr[-1] == ' '));
+ in_multispace = mb_c == ' ' && (*ptr == ' ' || (prev_ptr > line && prev_ptr[-1] == ' '));
if (!in_multispace) {
multispace_pos = 0;
}
@@ -2325,61 +2269,56 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// But not when the character is followed by a composing
// character (use mb_l to check that).
if (wp->w_p_list
- && ((((c == 160 && mb_l == 1)
- || (mb_utf8
- && ((mb_c == 160 && mb_l == 2)
- || (mb_c == 0x202f && mb_l == 3))))
+ && ((((mb_c == 160 && mb_l == 2) || (mb_c == 0x202f && mb_l == 3))
&& wp->w_p_lcs_chars.nbsp)
- || (c == ' '
+ || (mb_c == ' '
&& mb_l == 1
&& (wp->w_p_lcs_chars.space
|| (in_multispace && wp->w_p_lcs_chars.multispace != NULL))
&& ptr - line >= leadcol
&& ptr - line <= trailcol))) {
if (in_multispace && wp->w_p_lcs_chars.multispace != NULL) {
- c = wp->w_p_lcs_chars.multispace[multispace_pos++];
+ mb_c = wp->w_p_lcs_chars.multispace[multispace_pos++];
if (wp->w_p_lcs_chars.multispace[multispace_pos] == NUL) {
multispace_pos = 0;
}
} else {
- c = (c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
+ mb_c = (mb_c == ' ') ? wp->w_p_lcs_chars.space : wp->w_p_lcs_chars.nbsp;
}
wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_schar = schar_from_char(mb_c);
}
- if (c == ' ' && ((trailcol != MAXCOL && ptr > line + trailcol)
- || (leadcol != 0 && ptr < line + leadcol))) {
+ if (mb_c == ' ' && mb_l == 1 && ((trailcol != MAXCOL && ptr > line + trailcol)
+ || (leadcol != 0 && ptr < line + leadcol))) {
if (leadcol != 0 && in_multispace && ptr < line + leadcol
&& wp->w_p_lcs_chars.leadmultispace != NULL) {
- c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
+ mb_c = wp->w_p_lcs_chars.leadmultispace[multispace_pos++];
if (wp->w_p_lcs_chars.leadmultispace[multispace_pos] == NUL) {
multispace_pos = 0;
}
} else if (ptr > line + trailcol && wp->w_p_lcs_chars.trail) {
- c = wp->w_p_lcs_chars.trail;
+ mb_c = wp->w_p_lcs_chars.trail;
} else if (ptr < line + leadcol && wp->w_p_lcs_chars.lead) {
- c = wp->w_p_lcs_chars.lead;
+ mb_c = wp->w_p_lcs_chars.lead;
} else if (leadcol != 0 && wp->w_p_lcs_chars.space) {
- c = wp->w_p_lcs_chars.space;
+ mb_c = wp->w_p_lcs_chars.space;
}
wlv.n_attr = 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_schar = schar_from_char(mb_c);
}
}
// Handling of non-printable characters.
- if (!vim_isprintc(c)) {
+ if (!vim_isprintc(mb_c)) {
// when getting a character from the file, we may have to
// turn it into something else on the way to putting it on the screen.
- if (c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
+ if (mb_c == TAB && (!wp->w_p_list || wp->w_p_lcs_chars.tab1)) {
int tab_len = 0;
colnr_T vcol_adjusted = wlv.vcol; // removed showbreak length
char *const sbr = get_showbreak_value(wp);
@@ -2422,7 +2361,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wlv.n_extra > 0) {
len += wlv.n_extra - tab_len;
}
- c = wp->w_p_lcs_chars.tab1;
+ mb_c = wp->w_p_lcs_chars.tab1;
p = get_extra_buf((size_t)len + 1);
memset(p, ' ', (size_t)len);
p[len] = NUL;
@@ -2470,11 +2409,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
}
- mb_utf8 = false; // don't draw as UTF-8
if (wp->w_p_list) {
- c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
- ? wp->w_p_lcs_chars.tab3
- : wp->w_p_lcs_chars.tab1;
+ mb_c = (wlv.n_extra == 0 && wp->w_p_lcs_chars.tab3)
+ ? wp->w_p_lcs_chars.tab3 : wp->w_p_lcs_chars.tab1;
if (wp->w_p_lbr && wlv.p_extra != NULL && *wlv.p_extra != NUL) {
wlv.c_extra = NUL; // using p_extra from above
} else {
@@ -2484,14 +2421,13 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = tab_len + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_0);
saved_attr2 = wlv.char_attr; // save current attr
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
} else {
wlv.c_final = NUL;
wlv.c_extra = ' ';
- c = ' ';
+ mb_c = ' ';
}
- } else if (c == NUL
+ mb_schar = schar_from_char(mb_c);
+ } else if (mb_c == NUL
&& (wp->w_p_list
|| ((wlv.fromcol >= 0 || fromcol_prev >= 0)
&& wlv.tocol > wlv.vcol
@@ -2515,20 +2451,19 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_extra = 0;
}
if (wp->w_p_list && wp->w_p_lcs_chars.eol > 0) {
- c = wp->w_p_lcs_chars.eol;
+ mb_c = wp->w_p_lcs_chars.eol;
} else {
- c = ' ';
+ mb_c = ' ';
}
lcs_eol_one = -1;
ptr--; // put it back at the NUL
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
wlv.n_attr = 1;
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
- } else if (c != NUL) {
- wlv.p_extra = transchar_buf(wp->w_buffer, c);
+ mb_schar = schar_from_char(mb_c);
+ } else if (mb_c != NUL) {
+ wlv.p_extra = transchar_buf(wp->w_buffer, mb_c);
if (wlv.n_extra == 0) {
- wlv.n_extra = byte2cells(c) - 1;
+ wlv.n_extra = byte2cells(mb_c) - 1;
}
if ((dy_flags & DY_UHEX) && wp->w_p_rl) {
rl_mirror_ascii(wlv.p_extra, NULL); // reverse "<12>"
@@ -2538,7 +2473,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
if (wp->w_p_lbr) {
char *p;
- c = (uint8_t)(*wlv.p_extra);
+ mb_c = (uint8_t)(*wlv.p_extra);
p = get_extra_buf((size_t)wlv.n_extra + 1);
memset(p, ' ', (size_t)wlv.n_extra);
strncpy(p, // NOLINT(runtime/printf)
@@ -2547,20 +2482,21 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
p[wlv.n_extra] = NUL;
wlv.p_extra = p;
} else {
- wlv.n_extra = byte2cells(c) - 1;
- c = (uint8_t)(*wlv.p_extra++);
+ wlv.n_extra = byte2cells(mb_c) - 1;
+ mb_c = (uint8_t)(*wlv.p_extra++);
}
wlv.n_attr = wlv.n_extra + 1;
wlv.extra_attr = win_hl_attr(wp, HLF_8);
saved_attr2 = wlv.char_attr; // save current attr
- mb_utf8 = false; // don't draw as UTF-8
+ mb_schar = schar_from_ascii(mb_c);
} else if (VIsual_active
&& (VIsual_mode == Ctrl_V || VIsual_mode == 'v')
&& virtual_active()
&& wlv.tocol != MAXCOL
&& wlv.vcol < wlv.tocol
&& wlv.col < grid->cols) {
- c = ' ';
+ mb_c = ' ';
+ mb_schar = schar_from_char(mb_c);
ptr--; // put it back at the NUL
}
}
@@ -2580,18 +2516,18 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// First time at this concealed item: display one
// character.
if (has_match_conc && match_conc) {
- c = match_conc;
+ mb_c = match_conc;
} else if (decor_conceal && decor_state.conceal_char) {
- c = decor_state.conceal_char;
+ mb_c = decor_state.conceal_char;
if (decor_state.conceal_attr) {
wlv.char_attr = decor_state.conceal_attr;
}
} else if (syn_get_sub_char() != NUL) {
- c = syn_get_sub_char();
+ mb_c = syn_get_sub_char();
} else if (wp->w_p_lcs_chars.conceal != NUL) {
- c = wp->w_p_lcs_chars.conceal;
+ mb_c = wp->w_p_lcs_chars.conceal;
} else {
- c = ' ';
+ mb_c = ' ';
}
prev_syntax_id = syntax_seqnr;
@@ -2610,8 +2546,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
is_concealing = true;
wlv.skip_cells = 1;
}
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_schar = schar_from_char(mb_c);
} else {
prev_syntax_id = 0;
is_concealing = false;
@@ -2654,8 +2589,8 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
&& (wp->w_p_wrap ? (wp->w_skipcol > 0 && wlv.row == 0) : wp->w_leftcol > 0)
&& wlv.filler_todo <= 0
&& wlv.draw_state > WL_STC
- && c != NUL) {
- c = wp->w_p_lcs_chars.prec;
+ && mb_c != NUL) {
+ mb_c = wp->w_p_lcs_chars.prec;
lcs_prec_todo = NUL;
if (utf_char2cells(mb_c) > 1) {
// Double-width character being overwritten by the "precedes"
@@ -2666,15 +2601,14 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
wlv.n_attr = 2;
wlv.extra_attr = win_hl_attr(wp, HLF_AT);
}
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_schar = schar_from_char(mb_c);
saved_attr3 = wlv.char_attr; // save current attr
wlv.char_attr = win_hl_attr(wp, HLF_AT); // overwriting char_attr
n_attr3 = 1;
}
// At end of the text line or just after the last character.
- if (c == NUL && eol_hl_off == 0) {
+ if (mb_c == NUL && eol_hl_off == 0) {
// flag to indicate whether prevcol equals startcol of search_hl or
// one of the matches
bool prevcol_hl_flag = get_prevcol_hl_flag(wp, &screen_search_hl,
@@ -2728,7 +2662,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
}
// At end of the text line.
- if (c == NUL) {
+ if (mb_c == NUL) {
// Highlight 'cursorcolumn' & 'colorcolumn' past end of the line.
if (wp->w_p_wrap) {
v = wlv.startrow == 0 ? wp->w_skipcol : 0;
@@ -2874,10 +2808,9 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
|| lcs_eol_one > 0
|| (wlv.n_extra > 0 && (wlv.c_extra != NUL || *wlv.p_extra != NUL))
|| has_more_inline_virt(&wlv, v)) {
- c = wp->w_p_lcs_chars.ext;
+ mb_c = wp->w_p_lcs_chars.ext;
wlv.char_attr = win_hl_attr(wp, HLF_AT);
- mb_c = c;
- mb_utf8 = check_mb_utf8(&c, u8cc);
+ mb_schar = schar_from_char(mb_c);
}
}
@@ -2923,11 +2856,7 @@ int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool number_onl
// Skip characters that are left of the screen for 'nowrap'.
if (wlv.draw_state < WL_LINE || wlv.skip_cells <= 0) {
// Store the character.
- if (mb_utf8) {
- linebuf_char[wlv.off] = schar_from_cc(mb_c, u8cc);
- } else {
- linebuf_char[wlv.off] = schar_from_ascii((char)c);
- }
+ linebuf_char[wlv.off] = mb_schar;
if (multi_attr) {
linebuf_attr[wlv.off] = multi_attr;
multi_attr = 0;
diff --git a/src/nvim/edit.c b/src/nvim/edit.c
index ce547b55fe..eb5ea2c873 100644
--- a/src/nvim/edit.c
+++ b/src/nvim/edit.c
@@ -1462,7 +1462,7 @@ void edit_putchar(int c, bool highlight)
pc_status = PC_STATUS_SET;
}
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
grid_line_puts(pc_col, buf, utf_char2bytes(c, buf), attr);
grid_line_flush();
}
@@ -2176,7 +2176,7 @@ void insertchar(int c, int flags, int second_indent)
int cc;
if ((cc = utf_char2len(c)) > 1) {
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf);
buf[cc] = NUL;
@@ -3681,7 +3681,6 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
int cc;
int temp = 0; // init for GCC
bool did_backspace = false;
- int cpc[MAX_MCO]; // composing characters
bool call_fix_indent = false;
// can't delete anything in an empty file
@@ -3910,15 +3909,15 @@ static bool ins_bs(int c, int mode, int *inserted_space_p)
if (State & REPLACE_FLAG) {
replace_do_bs(-1);
} else {
- const int l_p_deco = p_deco;
- if (l_p_deco) {
- (void)utfc_ptr2char(get_cursor_pos_ptr(), cpc);
+ bool has_composing = false;
+ if (p_deco) {
+ char *p0 = get_cursor_pos_ptr();
+ has_composing = utf_composinglike(p0, p0 + utf_ptr2len(p0));
}
(void)del_char(false);
// If there are combining characters and 'delcombine' is set
- // move the cursor back. Don't back up before the base
- // character.
- if (l_p_deco && cpc[0] != NUL) {
+ // move the cursor back. Don't back up before the base character.
+ if (has_composing) {
inc_cursor();
}
if (revins_chars) {
diff --git a/src/nvim/eval.c b/src/nvim/eval.c
index ed70091077..c073f30547 100644
--- a/src/nvim/eval.c
+++ b/src/nvim/eval.c
@@ -7117,7 +7117,7 @@ dict_T *get_vim_var_dict(int idx) FUNC_ATTR_PURE
/// Set v:char to character "c".
void set_vim_var_char(int c)
{
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(c, buf)] = NUL;
set_vim_var_string(VV_CHAR, buf, -1);
diff --git a/src/nvim/eval/funcs.c b/src/nvim/eval/funcs.c
index c6909245af..8ef208f291 100644
--- a/src/nvim/eval/funcs.c
+++ b/src/nvim/eval/funcs.c
@@ -5134,7 +5134,7 @@ static void f_nr2char(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
return;
}
- char buf[MB_MAXBYTES];
+ char buf[MB_MAXCHAR];
const int len = utf_char2bytes((int)num, buf);
rettv->v_type = VAR_STRING;
@@ -6891,7 +6891,7 @@ static void f_screenchar(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
c = -1;
} else {
- char buf[MB_MAXBYTES + 1];
+ char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
c = utf_ptr2char(buf);
}
@@ -6907,24 +6907,22 @@ static void f_screenchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
ScreenGrid *grid;
screenchar_adjust(&grid, &row, &col);
+ tv_list_alloc_ret(rettv, kListLenMayKnow);
if (row < 0 || row >= grid->rows || col < 0 || col >= grid->cols) {
- tv_list_alloc_ret(rettv, 0);
return;
}
- char buf[MB_MAXBYTES + 1];
+ char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
- int pcc[MAX_MCO];
- int c = utfc_ptr2char(buf, pcc);
- int composing_len = 0;
- while (composing_len < MAX_MCO && pcc[composing_len] != 0) {
- composing_len++;
- }
- tv_list_alloc_ret(rettv, composing_len + 1);
- tv_list_append_number(rettv->vval.v_list, c);
- for (int i = 0; i < composing_len; i++) {
- tv_list_append_number(rettv->vval.v_list, pcc[i]);
- }
+
+ // schar values are already processed chars which are always NUL-terminated.
+ // A single [0] is expected when char is NUL.
+ size_t i = 0;
+ do {
+ int c = utf_ptr2char(buf + i);
+ tv_list_append_number(rettv->vval.v_list, c);
+ i += (size_t)utf_ptr2len(buf + i);
+ } while (buf[i] != NUL);
}
/// "screencol()" function
@@ -6957,7 +6955,7 @@ static void f_screenstring(typval_T *argvars, typval_T *rettv, EvalFuncData fptr
return;
}
- char buf[MB_MAXBYTES + 1];
+ char buf[MAX_SCHAR_SIZE + 1];
schar_get(buf, grid_getchar(grid, row, col, NULL));
rettv->vval.v_string = xstrdup(buf);
}
@@ -7413,8 +7411,7 @@ static void f_setcharsearch(typval_T *argvars, typval_T *rettv, EvalFuncData fpt
char *const csearch = tv_dict_get_string(d, "char", false);
if (csearch != NULL) {
- int pcc[MAX_MCO];
- const int c = utfc_ptr2char(csearch, pcc);
+ int c = utf_ptr2char(csearch);
set_last_csearch(c, csearch, utfc_ptr2len(csearch));
}
diff --git a/src/nvim/ex_cmds.c b/src/nvim/ex_cmds.c
index 692b320335..d92be6404b 100644
--- a/src/nvim/ex_cmds.c
+++ b/src/nvim/ex_cmds.c
@@ -131,17 +131,22 @@ static const char e_non_numeric_argument_to_z[]
/// ":ascii" and "ga" implementation
void do_ascii(exarg_T *eap)
{
- char *dig;
- int cc[MAX_MCO];
- int c = utfc_ptr2char(get_cursor_pos_ptr(), cc);
- if (c == NUL) {
+ char *data = get_cursor_pos_ptr();
+ size_t len = (size_t)utfc_ptr2len(data);
+
+ if (len == 0) {
msg("NUL", 0);
return;
}
- size_t iobuff_len = 0;
+ bool need_clear = true;
+ msg_sb_eol();
+ msg_start();
+
+ int c = utf_ptr2char(data);
+ size_t off = 0;
- int ci = 0;
+ // TODO(bfredl): merge this with the main loop
if (c < 0x80) {
if (c == NL) { // NUL is stored as NL.
c = NUL;
@@ -160,46 +165,29 @@ void do_ascii(exarg_T *eap)
char buf2[20];
buf2[0] = NUL;
- dig = get_digraph_for_char(cval);
+ char *dig = get_digraph_for_char(cval);
if (dig != NULL) {
- iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
- sizeof(IObuff) - iobuff_len,
- _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"),
- transchar(c), buf1, buf2, cval, cval, cval, dig);
+ vim_snprintf(IObuff, sizeof(IObuff),
+ _("<%s>%s%s %d, Hex %02x, Oct %03o, Digr %s"),
+ transchar(c), buf1, buf2, cval, cval, cval, dig);
} else {
- iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
- sizeof(IObuff) - iobuff_len,
- _("<%s>%s%s %d, Hex %02x, Octal %03o"),
- transchar(c), buf1, buf2, cval, cval, cval);
- }
-
- c = cc[ci++];
- }
-
-#define SPACE_FOR_DESC (1 + 1 + 1 + MB_MAXBYTES + 16 + 4 + 3 + 3 + 1)
- // Space for description:
- // - 1 byte for separator (starting from second entry)
- // - 1 byte for "<"
- // - 1 byte for space to draw composing character on (optional, but really
- // mostly required)
- // - up to MB_MAXBYTES bytes for character itself
- // - 16 bytes for raw text ("> , Hex , Octal ").
- // - at least 4 bytes for hexadecimal representation
- // - at least 3 bytes for decimal representation
- // - at least 3 bytes for octal representation
- // - 1 byte for NUL
- //
- // Taking into account MAX_MCO and characters which need 8 bytes for
- // hexadecimal representation, but not taking translation into account:
- // resulting string will occupy less then 400 bytes (conservative estimate).
- //
- // Less then 1000 bytes if translation multiplies number of bytes needed for
- // raw text by 6, so it should always fit into 1025 bytes reserved for IObuff.
+ vim_snprintf(IObuff, sizeof(IObuff),
+ _("<%s>%s%s %d, Hex %02x, Octal %03o"),
+ transchar(c), buf1, buf2, cval, cval, cval);
+ }
+
+ msg_multiline(IObuff, 0, true, &need_clear);
+
+ off += (size_t)utf_ptr2len(data); // needed for overlong ascii?
+ }
// Repeat for combining characters, also handle multiby here.
- while (c >= 0x80 && iobuff_len < sizeof(IObuff) - SPACE_FOR_DESC) {
+ while (off < len) {
+ c = utf_ptr2char(data + off);
+
+ size_t iobuff_len = 0;
// This assumes every multi-byte char is printable...
- if (iobuff_len > 0) {
+ if (off > 0) {
IObuff[iobuff_len++] = ' ';
}
IObuff[iobuff_len++] = '<';
@@ -208,32 +196,30 @@ void do_ascii(exarg_T *eap)
}
iobuff_len += (size_t)utf_char2bytes(c, IObuff + iobuff_len);
- dig = get_digraph_for_char(c);
+ char *dig = get_digraph_for_char(c);
if (dig != NULL) {
- iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
- sizeof(IObuff) - iobuff_len,
- (c < 0x10000
- ? _("> %d, Hex %04x, Oct %o, Digr %s")
- : _("> %d, Hex %08x, Oct %o, Digr %s")),
- c, c, c, dig);
+ vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
+ (c < 0x10000
+ ? _("> %d, Hex %04x, Oct %o, Digr %s")
+ : _("> %d, Hex %08x, Oct %o, Digr %s")),
+ c, c, c, dig);
} else {
- iobuff_len += (size_t)vim_snprintf(IObuff + iobuff_len,
- sizeof(IObuff) - iobuff_len,
- (c < 0x10000
- ? _("> %d, Hex %04x, Octal %o")
- : _("> %d, Hex %08x, Octal %o")),
- c, c, c);
- }
- if (ci == MAX_MCO) {
- break;
+ vim_snprintf(IObuff + iobuff_len, sizeof(IObuff) - iobuff_len,
+ (c < 0x10000
+ ? _("> %d, Hex %04x, Octal %o")
+ : _("> %d, Hex %08x, Octal %o")),
+ c, c, c);
}
- c = cc[ci++];
- }
- if (ci != MAX_MCO && c != 0) {
- xstrlcpy(IObuff + iobuff_len, " ...", sizeof(IObuff) - iobuff_len);
+
+ msg_multiline(IObuff, 0, true, &need_clear);
+
+ off += (size_t)utf_ptr2len(data + off); // needed for overlong ascii?
}
- msg(IObuff, 0);
+ if (need_clear) {
+ msg_clr_eos();
+ }
+ msg_end();
}
/// ":left", ":center" and ":right": align text.
diff --git a/src/nvim/grid.c b/src/nvim/grid.c
index f21b7e3a90..6320abe4ea 100644
--- a/src/nvim/grid.c
+++ b/src/nvim/grid.c
@@ -68,21 +68,6 @@ void grid_adjust(ScreenGrid **grid, int *row_off, int *col_off)
}
}
-/// Put a unicode char, and up to MAX_MCO composing chars, in a screen cell.
-schar_T schar_from_cc(int c, int u8cc[MAX_MCO])
-{
- char buf[MAX_SCHAR_SIZE];
- int len = utf_char2bytes(c, buf);
- for (int i = 0; i < MAX_MCO; i++) {
- if (u8cc[i] == 0) {
- break;
- }
- len += utf_char2bytes(u8cc[i], buf + len);
- }
- buf[len] = 0;
- return schar_from_buf(buf, (size_t)len);
-}
-
schar_T schar_from_str(char *str)
{
if (str == NULL) {
@@ -243,22 +228,21 @@ void line_do_arabic_shape(schar_T *buf, int cols)
schar_get(scbuf, buf[i]);
char scbuf_new[MAX_SCHAR_SIZE];
- int len = utf_char2bytes(c0new, scbuf_new);
+ size_t len = (size_t)utf_char2bytes(c0new, scbuf_new);
if (c1new) {
- len += utf_char2bytes(c1new, scbuf_new + len);
+ len += (size_t)utf_char2bytes(c1new, scbuf_new + len);
}
int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0);
size_t rest = strlen(scbuf + off);
- if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) {
- // TODO(bfredl): this cannot happen just yet, as we only construct
- // schar_T values with up to MAX_MCO+1 composing codepoints. When code
- // is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit,
- // we need be able to peel off a composing char which doesn't fit anymore.
- abort();
+ if (rest + len + 1 > MAX_SCHAR_SIZE) {
+ // Too bigly, discard one code-point.
+ // This should be enough as c0 cannot grow more than from 2 to 4 bytes
+ // (base arabic to extended arabic)
+ rest -= (size_t)utf_cp_head_off(scbuf + off, scbuf + off + rest - 1) + 1;
}
memcpy(scbuf_new + len, scbuf + off, rest);
- buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest);
+ buf[i] = schar_from_buf(scbuf_new, len + rest);
next:
c0prev = c0;
@@ -289,9 +273,9 @@ static bool grid_invalid_row(ScreenGrid *grid, int row)
return grid->attrs[grid->line_offset[row]] < 0;
}
-/// Get a single character directly from grid.chars into "bytes", which must
-/// have a size of "MB_MAXBYTES + 1".
-/// If "attrp" is not NULL, return the character's attribute in "*attrp".
+/// Get a single character directly from grid.chars
+///
+/// @param[out] attrp set to the character's attribute (optional)
schar_T grid_getchar(ScreenGrid *grid, int row, int col, int *attrp)
{
grid_adjust(&grid, &row, &col);
@@ -385,42 +369,35 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
{
const char *ptr = text;
int len = textlen;
- int u8cc[MAX_MCO];
assert(grid_line_grid);
int start_col = col;
int max_col = grid_line_maxcol;
- while (col < max_col
- && (len < 0 || (int)(ptr - text) < len)
- && *ptr != NUL) {
+ while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) {
// check if this is the first byte of a multibyte
int mbyte_blen = len > 0
? utfc_ptr2len_len(ptr, (int)((text + len) - ptr))
: utfc_ptr2len(ptr);
- int u8c = len >= 0
- ? utfc_ptr2char_len(ptr, u8cc, (int)((text + len) - ptr))
- : utfc_ptr2char(ptr, u8cc);
- int mbyte_cells = utf_char2cells(u8c);
+ int firstc;
+ schar_T schar = len >= 0
+ ? utfc_ptr2schar_len(ptr, (int)((text + len) - ptr), &firstc)
+ : utfc_ptr2schar(ptr, &firstc);
+ int mbyte_cells = utf_char2cells(firstc);
if (mbyte_cells > 2) {
mbyte_cells = 1;
- u8c = 0xFFFD;
- u8cc[0] = 0;
+
+ schar = schar_from_char(0xFFFD);
}
if (col + mbyte_cells > max_col) {
// Only 1 cell left, but character requires 2 cells:
// display a '>' in the last column to avoid wrapping. */
- u8c = '>';
- u8cc[0] = 0;
+ schar = schar_from_ascii('>');
mbyte_cells = 1;
}
- schar_T buf;
- // TODO(bfredl): why not just keep the original byte sequence.
- buf = schar_from_cc(u8c, u8cc);
-
// When at the start of the text and overwriting the right half of a
// two-cell character in the same grid, truncate that into a '>'.
if (ptr == text && col > grid_line_first && col < grid_line_last
@@ -428,7 +405,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr)
linebuf_char[col - 1] = schar_from_ascii('>');
}
- linebuf_char[col] = buf;
+ linebuf_char[col] = schar;
linebuf_attr[col] = attr;
linebuf_vcol[col] = -1;
if (mbyte_cells == 2) {
diff --git a/src/nvim/grid_defs.h b/src/nvim/grid_defs.h
index 11e736fc0c..3cc2d788d3 100644
--- a/src/nvim/grid_defs.h
+++ b/src/nvim/grid_defs.h
@@ -7,8 +7,8 @@
#include "nvim/pos.h"
#include "nvim/types.h"
-#define MAX_MCO 6 // fixed value for 'maxcombine'
-// Includes final NUL. at least 4*(MAX_MCO+1)+1
+// Includes final NUL. MAX_MCO is no longer used, but at least 4*(MAX_MCO+1)+1=29
+// ensures we can fit all composed chars which did fit before.
#define MAX_SCHAR_SIZE 32
// if data[0] is 0xFF, then data[1..4] is a 24-bit index (in machine endianness)
@@ -35,7 +35,7 @@ enum {
/// we can avoid sending bigger updates than necessary to the Ul layer.
///
/// Screen cells are stored as NUL-terminated UTF-8 strings, and a cell can
-/// contain up to MAX_MCO composing characters after the base character.
+/// contain composing characters as many as fits in MAX_SCHAR_SIZE-1 bytes
/// The composing characters are to be drawn on top of the original character.
/// The content after the NUL is not defined (so comparison must be done a
/// single cell at a time). Double-width characters are stored in the left cell,
diff --git a/src/nvim/insexpand.c b/src/nvim/insexpand.c
index f565d5b9e8..adbd2a5315 100644
--- a/src/nvim/insexpand.c
+++ b/src/nvim/insexpand.c
@@ -1743,7 +1743,7 @@ void ins_compl_addleader(int c)
return;
}
if ((cc = utf_char2len(c)) > 1) {
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
utf_char2bytes(c, buf);
buf[cc] = NUL;
diff --git a/src/nvim/lua/stdlib.c b/src/nvim/lua/stdlib.c
index 5072d14c0e..a200b0a32f 100644
--- a/src/nvim/lua/stdlib.c
+++ b/src/nvim/lua/stdlib.c
@@ -224,7 +224,7 @@ static int nlua_str_utf_start(lua_State *const lstate) FUNC_ATTR_NONNULL_ALL
if (offset < 0 || offset > (intptr_t)s1_len) {
return luaL_error(lstate, "index out of range");
}
- int head_offset = utf_cp_head_off(s1, s1 + offset - 1);
+ int head_offset = -utf_cp_head_off(s1, s1 + offset - 1);
lua_pushinteger(lstate, head_offset);
return 1;
}
diff --git a/src/nvim/match.c b/src/nvim/match.c
index 3420455e5f..0cd0426cff 100644
--- a/src/nvim/match.c
+++ b/src/nvim/match.c
@@ -939,7 +939,7 @@ void f_getmatches(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
tv_dict_add_nr(dict, S_LEN("id"), (varnumber_T)cur->mit_id);
if (cur->mit_conceal_char) {
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
buf[utf_char2bytes(cur->mit_conceal_char, buf)] = NUL;
tv_dict_add_str(dict, S_LEN("conceal"), buf);
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0d468889a4..3a13aeddb8 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -48,6 +48,7 @@
#include "nvim/getchar.h"
#include "nvim/gettext.h"
#include "nvim/globals.h"
+#include "nvim/grid.h"
#include "nvim/grid_defs.h"
#include "nvim/iconv.h"
#include "nvim/keycodes.h"
@@ -722,80 +723,68 @@ bool utf_composinglike(const char *p1, const char *p2)
return arabic_combine(utf_ptr2char(p1), c2);
}
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
///
-/// Also gets up to #MAX_MCO composing characters.
+/// Caller is expected to check for things like unprintable chars etc
+/// If first char in string is a composing char, prepend a space to display it correctly.
///
-/// @param[out] pcc Location where to store composing characters. Must have
-/// space at least for #MAX_MCO + 1 elements.
+/// If "p" starts with an invalid sequence, zero is returned.
///
-/// @return leading character.
-int utfc_ptr2char(const char *p, int *pcc)
+/// @param[out] firstc (required) The first codepoint of the screen char,
+/// or the first byte of an invalid sequence
+///
+/// @return the char
+schar_T utfc_ptr2schar(const char *p, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
- int i = 0;
-
int c = utf_ptr2char(p);
- int len = utf_ptr2len(p);
-
- // Only accept a composing char when the first char isn't illegal.
- if ((len > 1 || (uint8_t)(*p) < 0x80)
- && (uint8_t)p[len] >= 0x80
- && utf_composinglike(p, p + len)) {
- int cc = utf_ptr2char(p + len);
- while (true) {
- pcc[i++] = cc;
- if (i == MAX_MCO) {
- break;
- }
- len += utf_ptr2len(p + len);
- if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
- break;
- }
- }
- }
+ *firstc = c; // NOT optional, you are gonna need it
+ bool first_compose = utf_iscomposing(c);
+ size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+ size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
- if (i < MAX_MCO) { // last composing char must be 0
- pcc[i] = 0;
+ if (len == 1 && (uint8_t)(*p) >= 0x80) {
+ return 0; // invalid sequence
}
- return c;
+ return schar_from_buf_first(p, len, first_compose);
}
-// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
-// composing characters. Use no more than p[maxlen].
-//
-// @param [out] pcc: composing chars, last one is 0
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+/// Get the screen char at the beginning of a string with length
+///
+/// Like utfc_ptr2schar but use no more than p[maxlen].
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
assert(maxlen > 0);
- int i = 0;
+ size_t len = (size_t)utf_ptr2len_len(p, maxlen);
+ if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+ // invalid or truncated sequence
+ *firstc = (uint8_t)(*p);
+ return 0;
+ }
- int len = utf_ptr2len_len(p, maxlen);
- // Is it safe to use utf_ptr2char()?
- bool safe = len > 1 && len <= maxlen;
- int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
+ int c = utf_ptr2char(p);
+ *firstc = c;
+ bool first_compose = utf_iscomposing(c);
+ maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
+ len = (size_t)utfc_ptr2len_len(p, maxlen);
- // Only accept a composing char when the first char isn't illegal.
- if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
- for (; i < MAX_MCO; i++) {
- int len_cc = utf_ptr2len_len(p + len, maxlen - len);
- safe = len_cc > 1 && len_cc <= maxlen - len;
- if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
- || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
- break;
- }
- len += len_cc;
- }
- }
+ return schar_from_buf_first(p, len, first_compose);
+}
- if (i < MAX_MCO) {
- // last composing char must be 0
- pcc[i] = 0;
+/// Caller must ensure there is space for `first_compose`
+static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
+{
+ if (first_compose) {
+ char cbuf[MAX_SCHAR_SIZE];
+ cbuf[0] = ' ';
+ memcpy(cbuf + 1, buf, len);
+ return schar_from_buf(cbuf, len + 1);
+ } else {
+ return schar_from_buf(buf, len);
}
-
- return c;
-#undef ISCOMPOSING
}
/// Get the length of a UTF-8 byte sequence representing a single codepoint
@@ -878,8 +867,7 @@ int utfc_ptr2len(const char *const p)
return 1;
}
- // Check for composing characters. We can handle only the first six, but
- // skip all of them (otherwise the cursor would get stuck).
+ // Check for composing characters.
int prevlen = 0;
while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
@@ -1815,12 +1803,12 @@ int utf_cp_tail_off(const char *base, const char *p_in)
/// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
///
/// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
//
-/// @return 0 if invalid sequence, else offset to previous codepoint
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
int utf_cp_head_off(const char *base, const char *p)
{
int i;
@@ -1830,17 +1818,20 @@ int utf_cp_head_off(const char *base, const char *p)
}
// Find the first character that is not 10xx.xxxx
- for (i = 0; p - i > base; i--) {
- if (((uint8_t)p[i] & 0xc0) != 0x80) {
+ for (i = 0; p - i >= base; i++) {
+ if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break;
}
}
- // Find the last character that is 10xx.xxxx
- for (int j = 0; ((uint8_t)p[j + 1] & 0xc0) == 0x80; j++) {}
+ // Find the last character that is 10xx.xxxx (condition terminates on NUL)
+ int j = 1;
+ while (((uint8_t)p[j] & 0xc0) == 0x80) {
+ j++;
+ }
// Check for illegal sequence.
- if (utf8len_tab[(uint8_t)p[i]] == 1) {
+ if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0;
}
return i;
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 1d1a9439ad..c177f14ce2 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -7,6 +7,7 @@
#include "nvim/cmdexpand_defs.h"
#include "nvim/eval/typval_defs.h"
#include "nvim/func_attr.h"
+#include "nvim/grid_defs.h"
#include "nvim/mbyte_defs.h"
#include "nvim/os/os_defs.h"
#include "nvim/types.h"
diff --git a/src/nvim/message.c b/src/nvim/message.c
index ee1a9e60b0..9e9aa1fcd6 100644
--- a/src/nvim/message.c
+++ b/src/nvim/message.c
@@ -139,7 +139,7 @@ static int msg_grid_pos_at_flush = 0;
static void ui_ext_msg_set_pos(int row, bool scrolled)
{
- char buf[MAX_MCO + 1];
+ char buf[MB_MAXCHAR + 1];
size_t size = (size_t)utf_char2bytes(curwin->w_p_fcs_chars.msgsep, buf);
buf[size] = '\0';
ui_call_msg_set_pos(msg_grid.handle, row, scrolled,
@@ -1471,7 +1471,7 @@ void msg_putchar(int c)
void msg_putchar_attr(int c, int attr)
{
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
if (IS_SPECIAL(c)) {
buf[0] = (char)K_SPECIAL;
@@ -1560,12 +1560,6 @@ int msg_outtrans_len(const char *msgstr, int len, int attr)
mode_displayed = false;
}
- // If the string starts with a composing character first draw a space on
- // which the composing char can be drawn.
- if (utf_iscomposing(utf_ptr2char(msgstr))) {
- msg_puts_attr(" ", attr);
- }
-
// Go over the string. Special characters are translated and printed.
// Normal characters are printed several at a time.
while (--len >= 0 && !got_int) {
diff --git a/src/nvim/option_vars.h b/src/nvim/option_vars.h
index f0c752a2b1..0193e43de7 100644
--- a/src/nvim/option_vars.h
+++ b/src/nvim/option_vars.h
@@ -556,6 +556,7 @@ EXTERN char *p_mp; ///< 'makeprg'
EXTERN char *p_mps; ///< 'matchpairs'
EXTERN OptInt p_mat; ///< 'matchtime'
EXTERN OptInt p_mco; ///< 'maxcombine'
+#define MAX_MCO 6 // fixed value for 'maxcombine'
EXTERN OptInt p_mfd; ///< 'maxfuncdepth'
EXTERN OptInt p_mmd; ///< 'maxmapdepth'
EXTERN OptInt p_mmp; ///< 'maxmempattern'
diff --git a/src/nvim/spellsuggest.c b/src/nvim/spellsuggest.c
index 15a58a4434..dab278e383 100644
--- a/src/nvim/spellsuggest.c
+++ b/src/nvim/spellsuggest.c
@@ -3019,7 +3019,7 @@ static int soundfold_find(slang_T *slang, char *word)
static bool similar_chars(slang_T *slang, int c1, int c2)
{
int m1, m2;
- char buf[MB_MAXBYTES + 1];
+ char buf[MB_MAXCHAR + 1];
hashitem_T *hi;
if (c1 >= 256) {
diff --git a/test/functional/ui/fold_spec.lua b/test/functional/ui/fold_spec.lua
index 9a0182ea29..1addf7088e 100644
--- a/test/functional/ui/fold_spec.lua
+++ b/test/functional/ui/fold_spec.lua
@@ -1102,8 +1102,6 @@ describe("folded lines", function()
end)
it("works with multibyte text", function()
- -- Currently the only allowed value of 'maxcombine'
- eq(6, meths.get_option_value('maxcombine', {}))
eq(true, meths.get_option_value('arabicshape', {}))
insert([[
å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢͟ العَرَبِيَّة
@@ -1120,7 +1118,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ |
+ å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ |
möre tex^t |
{1:~ }|
{1:~ }|
@@ -1132,7 +1130,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ |
+ å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ |
möre tex^t |
{1:~ }|
{1:~ }|
@@ -1156,7 +1154,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
+ {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1168,7 +1166,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
+ {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ ﺎﻠﻋَﺮَﺒِﻳَّﺓ·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1192,7 +1190,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}|
+ {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1204,7 +1202,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة·················}|
+ {5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة·················}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1228,7 +1226,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}|
+ {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1240,7 +1238,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̎͂̀̂͛͛ العَرَبِيَّة···········}|
+ {7:+ }{8: 1 }{5:^+-- 2 lines: å 语 x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ العَرَبِيَّة···········}|
{1:~ }|
{1:~ }|
{1:~ }|
@@ -1265,7 +1263,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}|
+ {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1277,7 +1275,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {5:···········ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}{8: 1 }{7: +}|
+ {5:···········ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}{8: 1 }{7: +}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1301,7 +1299,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}|
+ {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1313,7 +1311,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {5:·················ةيَّبِرَعَلا x̎͂̀̂͛͛ 语 å :senil 2 --^+}|
+ {5:·················ةيَّبِرَعَلا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1337,7 +1335,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}|
+ {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1349,7 +1347,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̎͂̀̂͛͛ 语 å :senil 2 --^+}|
+ {5:·················ﺔﻴَّﺑِﺮَﻌَﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å :senil 2 --^+}|
{1: ~}|
{1: ~}|
{1: ~}|
@@ -1373,7 +1371,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å|
+ ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
txet eröm|
{1: ~}|
{1: ~}|
@@ -1385,7 +1383,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̎͂̀̂͛͛ 语 å|
+ ﺔﻴَّﺑِﺮَﻌَ^ﻟﺍ x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
txet eröm|
{1: ~}|
{1: ~}|
@@ -1409,7 +1407,7 @@ describe("folded lines", function()
[2:---------------------------------------------]|
[3:---------------------------------------------]|
## grid 2
- ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å|
+ ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
txet eröm|
{1: ~}|
{1: ~}|
@@ -1421,7 +1419,7 @@ describe("folded lines", function()
]])
else
screen:expect([[
- ةيَّبِرَعَ^لا x̎͂̀̂͛͛ 语 å|
+ ةيَّبِرَعَ^لا x̨̣̘̫̲͚͎̎͂̀̂͛͛̾͢ 语 å|
txet eröm|
{1: ~}|
{1: ~}|
diff --git a/test/functional/ui/multibyte_spec.lua b/test/functional/ui/multibyte_spec.lua
index 077dd1a779..d72bf27d6b 100644
--- a/test/functional/ui/multibyte_spec.lua
+++ b/test/functional/ui/multibyte_spec.lua
@@ -228,6 +228,36 @@ describe("multibyte rendering", function()
]]}
end)
+
+ it('works with arabicshape and multiple composing chars', function()
+ -- this tests an important edge case: arabicshape might increase the byte size of the base
+ -- character in a way so that the last composing char no longer fits. use "g8" on the text
+ -- to observe what is happening (the final E1 80 B7 gets deleted with 'arabicshape')
+ -- If we would increase the schar_t size, say from 32 to 64 bytes, we need to extend the
+ -- test text with even more zalgo energy to still touch this edge case.
+
+ meths.buf_set_lines(0,0,-1,true, {"سلام့̀́̂̃̄̅̆̇̈̉̊̋̌"})
+ command('set noarabicshape')
+
+ screen:expect{grid=[[
+ ^سلام့̀́̂̃̄̅̆̇̈̉̊̋̌ |
+ {1:~ }|
+ {1:~ }|
+ {1:~ }|
+ {1:~ }|
+ |
+ ]]}
+
+ command('set arabicshape')
+ screen:expect{grid=[[
+ ^ﺱﻼﻣ̀́̂̃̄̅̆̇̈̉̊̋̌ |
+ {1:~ }|
+ {1:~ }|
+ {1:~ }|
+ {1:~ }|
+ |
+ ]]}
+ end)
end)
describe('multibyte rendering: statusline', function()
diff --git a/test/functional/ui/output_spec.lua b/test/functional/ui/output_spec.lua
index 0dd1f0325c..7b93b74eac 100644
--- a/test/functional/ui/output_spec.lua
+++ b/test/functional/ui/output_spec.lua
@@ -225,8 +225,8 @@ describe("shell command :!", function()
å |
ref: å̲ |
1: å̲ |
- 2: å ̲ |
- 3: å ̲ |
+ 2: å ̲ |
+ 3: å ̲ |
|
{3:Press ENTER or type command to continue}^ |
]])
diff --git a/test/unit/mbyte_spec.lua b/test/unit/mbyte_spec.lua
index fdb1bceab0..cd94624570 100644
--- a/test/unit/mbyte_spec.lua
+++ b/test/unit/mbyte_spec.lua
@@ -4,17 +4,9 @@ local itp = helpers.gen_itp(it)
local ffi = helpers.ffi
local eq = helpers.eq
-local mbyte = helpers.cimport("./src/nvim/mbyte.h")
-local charset = helpers.cimport('./src/nvim/charset.h')
+local lib = helpers.cimport('./src/nvim/mbyte.h', './src/nvim/charset.h', './src/nvim/grid.h')
describe('mbyte', function()
- -- Array for composing characters
- local intp = ffi.typeof('int[?]')
- local function to_intp()
- -- how to get MAX_MCO from globals.h?
- return intp(7, 1)
- end
-
-- Convert from bytes to string
local function to_string(bytes)
local s = {}
@@ -30,14 +22,14 @@ describe('mbyte', function()
itp('utf_ptr2char', function()
-- For strings with length 1 the first byte is returned.
for c = 0, 255 do
- eq(c, mbyte.utf_ptr2char(to_string({c, 0})))
+ eq(c, lib.utf_ptr2char(to_string({c, 0})))
end
-- Some ill formed byte sequences that should not be recognized as UTF-8
-- First byte: 0xc0 or 0xc1
-- Second byte: 0x80 .. 0xbf
- --eq(0x00c0, mbyte.utf_ptr2char(to_string({0xc0, 0x80})))
- --eq(0x00c1, mbyte.utf_ptr2char(to_string({0xc1, 0xbf})))
+ --eq(0x00c0, lib.utf_ptr2char(to_string({0xc0, 0x80})))
+ --eq(0x00c1, lib.utf_ptr2char(to_string({0xc1, 0xbf})))
--
-- Sequences with more than four bytes
end)
@@ -47,240 +39,133 @@ describe('mbyte', function()
local char_p = ffi.typeof('char[?]')
for c = n * 0x1000, n * 0x1000 + 0xFFF do
local p = char_p(4, 0)
- mbyte.utf_char2bytes(c, p)
- eq(c, mbyte.utf_ptr2char(p))
- eq(charset.vim_iswordc(c), charset.vim_iswordp(p))
+ lib.utf_char2bytes(c, p)
+ eq(c, lib.utf_ptr2char(p))
+ eq(lib.vim_iswordc(c), lib.vim_iswordp(p))
end
end)
end
- describe('utfc_ptr2char_len', function()
+ describe('utfc_ptr2schar_len', function()
+ local function test_seq(seq)
+ local firstc = ffi.new("int[1]")
+ local buf = ffi.new("char[32]")
+ lib.schar_get(buf, lib.utfc_ptr2schar_len(to_string(seq), #seq, firstc))
+ return {ffi.string(buf), firstc[0]}
+ end
+
+ local function byte(val)
+ return {string.char(val), val}
+ end
itp('1-byte sequences', function()
- local pcc = to_intp()
- for c = 0, 255 do
- eq(c, mbyte.utfc_ptr2char_len(to_string({c}), pcc, 1))
- eq(0, pcc[0])
+ eq({'', 0}, test_seq{0})
+ for c = 1, 127 do
+ eq(byte(c), test_seq{c})
+ end
+ for c = 128, 255 do
+ eq({'', c}, test_seq{c})
end
end)
itp('2-byte sequences', function()
- local pcc = to_intp()
-- No combining characters
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f}), pcc, 2))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0x7f})
-- No combining characters
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80}), pcc, 2))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0x80})
-- No UTF-8 sequence
- pcc = to_intp()
- eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f}), pcc, 2))
- eq(0, pcc[0])
+ eq({'', 0xc2}, test_seq{0xc2, 0x7f})
-- One UTF-8 character
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80}), pcc, 2))
- eq(0, pcc[0])
+ eq({'\xc2\x80', 0x80}, test_seq{0xc2, 0x80})
-- No UTF-8 sequence
- pcc = to_intp()
- eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0xc0}), pcc, 2))
- eq(0, pcc[0])
+ eq({'', 0xc2}, test_seq{0xc2, 0xc0})
end)
itp('3-byte sequences', function()
- local pcc = to_intp()
-
-- No second UTF-8 character
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x80, 0x80}), pcc, 3))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0x80, 0x80})
-- No combining character
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0x80}), pcc, 3))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0xc2, 0x80})
-- Combining character is U+0300
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80}), pcc, 3))
- eq(0x0300, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80})
-- No UTF-8 sequence
- pcc = to_intp()
- eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc}), pcc, 3))
- eq(0, pcc[0])
+ eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc})
-- Incomplete combining character
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc}), pcc, 3))
- eq(0, pcc[0])
+ eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc})
- -- One UTF-8 character
- pcc = to_intp()
- eq(0x20d0, mbyte.utfc_ptr2char_len(to_string({0xe2, 0x83, 0x90}), pcc, 3))
- eq(0, pcc[0])
+ -- One UTF-8 character (composing only)
+ eq({" \xe2\x83\x90", 0x20d0}, test_seq{0xe2, 0x83, 0x90})
end)
itp('4-byte sequences', function()
- local pcc = to_intp()
-- No following combining character
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80}), pcc, 4))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80})
-- No second UTF-8 character
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80}), pcc, 4))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80})
-- Combining character U+0300
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 4))
- eq(0x0300, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc})
-- No UTF-8 sequence
- pcc = to_intp()
- eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80}), pcc, 4))
- eq(0, pcc[0])
+ eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80})
-- No following UTF-8 character
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc}), pcc, 4))
- eq(0, pcc[0])
+ eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc})
-- Combining character U+0301
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81}), pcc, 4))
- eq(0x0301, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81})
-- One UTF-8 character
- pcc = to_intp()
- eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80}), pcc, 4))
- eq(0, pcc[0])
+ eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80})
end)
itp('5+-byte sequences', function()
- local pcc = to_intp()
-
-- No following combining character
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0x7f, 0xcc, 0x80, 0x80})
-- No second UTF-8 character
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xc2, 0xcc, 0x80, 0x80}), pcc, 5))
- eq(0, pcc[0])
+ eq(byte(0x7f), test_seq{0x7f, 0xc2, 0xcc, 0x80, 0x80})
-- Combining character U+0300
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc}), pcc, 5))
- eq(0x0300, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\x7f\xcc\x80", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x00})
-- Combining characters U+0300 and U+0301
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81}), pcc, 5))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0000, pcc[2])
+ eq({"\x7f\xcc\x80\xcc\x81", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81})
-- Combining characters U+0300, U+0301, U+0302
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82}), pcc, 7))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0000, pcc[3])
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82})
-- Combining characters U+0300, U+0301, U+0302, U+0303
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string({0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83}), pcc, 9))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0303, pcc[3])
- eq(0x0000, pcc[4])
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83})
-- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
- {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84}), pcc, 11))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0303, pcc[3])
- eq(0x0304, pcc[4])
- eq(0x0000, pcc[5])
- -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
- -- U+0305
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
- {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0303, pcc[3])
- eq(0x0304, pcc[4])
- eq(0x0305, pcc[5])
- eq(1, pcc[6])
-
- -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304,
- -- U+0305, U+0306, but only save six (= MAX_MCO).
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
- {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86}), pcc, 15))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0303, pcc[3])
- eq(0x0304, pcc[4])
- eq(0x0305, pcc[5])
- eq(0x0001, pcc[6])
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84})
+ -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85})
- -- Only three following combining characters U+0300, U+0301, U+0302
- pcc = to_intp()
- eq(0x007f, mbyte.utfc_ptr2char_len(to_string(
- {0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85}), pcc, 13))
- eq(0x0300, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0302, pcc[2])
- eq(0x0000, pcc[3])
+ -- Combining characters U+0300, U+0301, U+0302, U+0303, U+0304, U+0305, U+0306
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82\xcc\x83\xcc\x84\xcc\x85\xcc\x86", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xcc, 0x83, 0xcc, 0x84, 0xcc, 0x85, 0xcc, 0x86})
+ -- Only three following combining characters U+0300, U+0301, U+0302
+ eq({"\x7f\xcc\x80\xcc\x81\xcc\x82", 0x7f}, test_seq{0x7f, 0xcc, 0x80, 0xcc, 0x81, 0xcc, 0x82, 0xc2, 0x80, 0xcc, 0x84, 0xcc, 0x85})
-- No UTF-8 sequence
- pcc = to_intp()
- eq(0x00c2, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x7f, 0xcc, 0x80, 0x80}), pcc, 5))
- eq(0, pcc[0])
+ eq({'', 0xc2}, test_seq{0xc2, 0x7f, 0xcc, 0x80, 0x80})
-- No following UTF-8 character
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0xcc, 0x80}), pcc, 5))
- eq(0, pcc[0])
+ eq({"\xc2\x80", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0xcc, 0x80})
-- Combining character U+0301
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0x7f}), pcc, 5))
- eq(0x0301, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0x7f})
-- Combining character U+0301
- pcc = to_intp()
- eq(0x0080, mbyte.utfc_ptr2char_len(to_string({0xc2, 0x80, 0xcc, 0x81, 0xcc}), pcc, 5))
- eq(0x0301, pcc[0])
- eq(0x0000, pcc[1])
+ eq({"\xc2\x80\xcc\x81", 0x80}, test_seq{0xc2, 0x80, 0xcc, 0x81, 0xcc})
-- One UTF-8 character
- pcc = to_intp()
- eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x7f}), pcc, 5))
- eq(0, pcc[0])
+ eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x7f})
-- One UTF-8 character
- pcc = to_intp()
- eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0x80}), pcc, 5))
- eq(0, pcc[0])
+ eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0x80})
-- One UTF-8 character
- pcc = to_intp()
- eq(0x100000, mbyte.utfc_ptr2char_len(to_string({0xf4, 0x80, 0x80, 0x80, 0xcc}), pcc, 5))
- eq(0, pcc[0])
+ eq({"\xf4\x80\x80\x80", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xcc})
-- Combining characters U+1AB0 and U+0301
- pcc = to_intp()
- eq(0x100000, mbyte.utfc_ptr2char_len(to_string(
- {0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81}), pcc, 9))
- eq(0x1ab0, pcc[0])
- eq(0x0301, pcc[1])
- eq(0x0000, pcc[2])
+ eq({"\xf4\x80\x80\x80\xe1\xaa\xb0\xcc\x81", 0x100000}, test_seq{0xf4, 0x80, 0x80, 0x80, 0xe1, 0xaa, 0xb0, 0xcc, 0x81})
end)
end)