diff options
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 331 |
1 files changed, 256 insertions, 75 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 223b4d6845..b874f0dc94 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -39,6 +39,7 @@ #include "nvim/arabic.h" #include "nvim/charset.h" #include "nvim/cursor.h" +#include "nvim/drawscreen.h" #include "nvim/eval.h" #include "nvim/fileio.h" #include "nvim/func_attr.h" @@ -49,7 +50,6 @@ #include "nvim/memline.h" #include "nvim/memory.h" #include "nvim/message.h" -#include "nvim/option.h" #include "nvim/os/os.h" #include "nvim/path.h" #include "nvim/screen.h" @@ -74,6 +74,19 @@ struct interval { # include "unicode_tables.generated.h" #endif +static char e_list_item_nr_is_not_list[] + = N_("E1109: List item %d is not a List"); +static char e_list_item_nr_does_not_contain_3_numbers[] + = N_("E1110: List item %d does not contain 3 numbers"); +static char e_list_item_nr_range_invalid[] + = N_("E1111: List item %d range invalid"); +static char e_list_item_nr_cell_width_invalid[] + = N_("E1112: List item %d cell width invalid"); +static char e_overlapping_ranges_for_nr[] + = N_("E1113: Overlapping ranges for 0x%lx"); +static char e_only_values_of_0x100_and_higher_supported[] + = N_("E1114: Only values of 0x100 and higher supported"); + // To speed up BYTELEN(); keep a lookup table to quickly get the length in // bytes of a UTF-8 character from the first byte of a UTF-8 string. Bytes // which are illegal when used as the first byte have a 1. The NUL byte has @@ -472,13 +485,18 @@ static bool intable(const struct interval *table, size_t n_items, int c) int utf_char2cells(int c) { if (c >= 0x100) { + int n = cw_value(c); + if (n != 0) { + return n; + } + if (!utf_printable(c)) { return 6; // unprintable, displays <xxxx> } if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) { return 2; } - if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) { + if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) { return 2; } } else if (c >= 0x80 && !vim_isprintc(c)) { @@ -736,21 +754,19 @@ bool utf_composinglike(const char_u *p1, const char_u *p2) /// space at least for #MAX_MCO + 1 elements. /// /// @return leading character. -int utfc_ptr2char(const char_u *p, int *pcc) +int utfc_ptr2char(const char *p_in, int *pcc) { - int len; - int c; - int cc; + uint8_t *p = (uint8_t *)p_in; int i = 0; - c = utf_ptr2char((char *)p); - len = utf_ptr2len((char *)p); + int c = utf_ptr2char((char *)p); + int len = utf_ptr2len((char *)p); // Only accept a composing char when the first char isn't illegal. if ((len > 1 || *p < 0x80) && p[len] >= 0x80 && utf_composinglike(p, p + len)) { - cc = utf_ptr2char((char *)p + len); + int cc = utf_ptr2char((char *)p + len); for (;;) { pcc[i++] = cc; if (i == MAX_MCO) { @@ -864,7 +880,7 @@ int utf_ptr2len_len(const char_u *p, int size) } else { m = len; } - for (i = 1; i < m; ++i) { + for (i = 1; i < m; i++) { if ((p[i] & 0xc0) != 0x80) { return 1; } @@ -872,9 +888,9 @@ int utf_ptr2len_len(const char_u *p, int size) return len; } -/// Return the number of bytes occupied by a UTF-8 character in a string -/// +/// Return the number of bytes occupied by a UTF-8 character in a string. /// This includes following composing characters. +/// Returns zero for NUL. int utfc_ptr2len(const char *const p_in) FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL { @@ -988,8 +1004,9 @@ int utf_char2len(const int c) /// Convert Unicode character to UTF-8 string /// -/// @param c character to convert to \p buf -/// @param[out] buf UTF-8 string generated from \p c, does not add \0 +/// @param c character to convert to UTF-8 string in \p buf +/// @param[out] buf UTF-8 string generated from \p c, does not add \0 +/// must have room for at least 6 bytes /// @return Number of bytes (1-6). int utf_char2bytes(const int c, char *const buf) { @@ -1164,6 +1181,11 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 1; // punctuation } + // emoji + if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { + return 3; + } + // binary search in table while (top >= bot) { mid = (bot + top) / 2; @@ -1176,11 +1198,6 @@ int utf_class_tab(const int c, const uint64_t *const chartab) } } - // emoji - if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) { - return 3; - } - // most other characters are "word" characters return 2; } @@ -1576,7 +1593,7 @@ void show_utf8(void) } clen = 0; - for (i = 0; i < len; ++i) { + for (i = 0; i < len; i++) { if (clen == 0) { // start of (composing) character, get its length if (i > 0) { @@ -1587,7 +1604,7 @@ void show_utf8(void) } sprintf((char *)IObuff + rlen, "%02x ", (line[i] == NL) ? NUL : line[i]); // NUL is stored as NL - --clen; + clen--; rlen += (int)STRLEN(IObuff + rlen); if (rlen > IOSIZE - 20) { break; @@ -1613,14 +1630,14 @@ int utf_head_off(const char_u *base, const char_u *p) // Skip backwards over trailing bytes: 10xx.xxxx // Skip backwards again if on a composing char. const char_u *q; - for (q = p;; --q) { + for (q = p;; q--) { // Move s to the last byte of this char. const char_u *s; - for (s = q; (s[1] & 0xc0) == 0x80; ++s) {} + for (s = q; (s[1] & 0xc0) == 0x80; s++) {} // Move q to the first byte of this char. while (q > base && (*q & 0xc0) == 0x80) { - --q; + q--; } // Check for illegal sequence. Do allow an illegal byte after where we // started. @@ -1641,10 +1658,10 @@ int utf_head_off(const char_u *base, const char_u *p) if (arabic_maycombine(c)) { // Advance to get a sneak-peak at the next char const char_u *j = q; - --j; + j--; // Move j to the first byte of this char. while (j > base && (*j & 0xc0) == 0x80) { - --j; + j--; } if (arabic_combine(utf_ptr2char((char *)j), c)) { continue; @@ -1800,9 +1817,9 @@ bool utf_allow_break(int cc, int ncc) /// /// @param[in,out] fp Source of the character to copy. /// @param[in,out] tp Destination to copy to. -void mb_copy_char(const char_u **const fp, char_u **const tp) +void mb_copy_char(const char **const fp, char **const tp) { - const size_t l = (size_t)utfc_ptr2len((char *)(*fp)); + const size_t l = (size_t)utfc_ptr2len(*fp); memmove(*tp, *fp, l); *tp += l; @@ -1913,7 +1930,7 @@ void utf_find_illegal(void) char_u *tofree = NULL; vimconv.vc_type = CONV_NONE; - if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) { + if (enc_canon_props((char_u *)curbuf->b_p_fenc) & ENC_8BIT) { // 'encoding' is "utf-8" but we are editing a 8-bit encoded file, // possibly a utf-8 file with illegal bytes. Setup for conversion // from utf-8 to 'fileencoding'. @@ -1925,7 +1942,7 @@ void utf_find_illegal(void) p = get_cursor_pos_ptr(); if (vimconv.vc_type != CONV_NONE) { xfree(tofree); - tofree = string_convert(&vimconv, p, NULL); + tofree = (char_u *)string_convert(&vimconv, (char *)p, NULL); if (tofree == NULL) { break; } @@ -1956,7 +1973,7 @@ void utf_find_illegal(void) if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) { break; } - ++curwin->w_cursor.lnum; + curwin->w_cursor.lnum++; curwin->w_cursor.col = 0; } @@ -1970,8 +1987,7 @@ theend: } /// @return true if string "s" is a valid utf-8 string. -/// When "end" is NULL stop at the first NUL. -/// When "end" is positive stop there. +/// When "end" is NULL stop at the first NUL. Otherwise stop at "end". bool utf_valid_string(const char_u *s, const char_u *end) { const char_u *p = s; @@ -2128,10 +2144,8 @@ const char *mb_unescape(const char **const pp) return NULL; } -/* - * Skip the Vim specific head of a 'encoding' name. - */ -char_u *enc_skip(char_u *p) +/// Skip the Vim specific head of a 'encoding' name. +char *enc_skip(char *p) { if (STRNCMP(p, "2byte-", 6) == 0) { return p + 6; @@ -2142,27 +2156,25 @@ char_u *enc_skip(char_u *p) return p; } -/* - * Find the canonical name for encoding "enc". - * When the name isn't recognized, returns "enc" itself, but with all lower - * case characters and '_' replaced with '-'. - * Returns an allocated string. - */ -char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET +/// Find the canonical name for encoding "enc". +/// When the name isn't recognized, returns "enc" itself, but with all lower +/// case characters and '_' replaced with '-'. +/// +/// @return an allocated string. +char *enc_canonize(char *enc) + FUNC_ATTR_NONNULL_RET { char_u *p, *s; - int i; - if (STRCMP(enc, "default") == 0) { // Use the default encoding as found by set_init_1(). - return vim_strsave(fenc_default); + return (char *)vim_strsave(fenc_default); } // copy "enc" to allocated memory, with room for two '-' char_u *r = xmalloc(STRLEN(enc) + 3); // Make it all lower case and replace '_' with '-'. p = r; - for (s = enc; *s != NUL; ++s) { + for (s = (char_u *)enc; *s != NUL; s++) { if (*s == '_') { *p++ = '-'; } else { @@ -2172,7 +2184,7 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET *p = NUL; // Skip "2byte-" and "8bit-". - p = enc_skip(r); + p = (char_u *)enc_skip((char *)r); // Change "microsoft-cp" to "cp". Used in some spell files. if (STRNCMP(p, "microsoft-cp", 12) == 0) { @@ -2196,6 +2208,7 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET STRMOVE(p + 5, p + 6); } + int i; if (enc_canon_search(p) >= 0) { // canonical name can be used unmodified if (p != r) { @@ -2206,7 +2219,7 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET xfree(r); r = vim_strsave((char_u *)enc_canon_table[i].name); } - return r; + return (char *)r; } /// Search for an encoding alias of "name". @@ -2215,7 +2228,7 @@ static int enc_alias_search(const char_u *name) { int i; - for (i = 0; enc_alias_table[i].name != NULL; ++i) { + for (i = 0; enc_alias_table[i].name != NULL; i++) { if (STRCMP(name, enc_alias_table[i].name) == 0) { return enc_alias_table[i].canon; } @@ -2291,7 +2304,7 @@ enc_locale_copy_enc: buf[i] = NUL; } - return enc_canonize((char_u *)buf); + return (char_u *)enc_canonize(buf); } #if defined(HAVE_ICONV) @@ -2314,7 +2327,7 @@ void *my_iconv_open(char_u *to, char_u *from) if (iconv_working == kBroken) { return (void *)-1; // detected a broken iconv() previously } - fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from)); + fd = iconv_open(enc_skip((char *)to), enc_skip((char *)from)); if (fd != (iconv_t)-1 && iconv_working == kUnknown) { /* @@ -2425,18 +2438,17 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen #endif // HAVE_ICONV -/* - * Setup "vcp" for conversion from "from" to "to". - * The names must have been made canonical with enc_canonize(). - * vcp->vc_type must have been initialized to CONV_NONE. - * Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8 - * instead). - * Afterwards invoke with "from" and "to" equal to NULL to cleanup. - * Return FAIL when conversion is not supported, OK otherwise. - */ -int convert_setup(vimconv_T *vcp, char_u *from, char_u *to) +/// Setup "vcp" for conversion from "from" to "to". +/// The names must have been made canonical with enc_canonize(). +/// vcp->vc_type must have been initialized to CONV_NONE. +/// Note: cannot be used for conversion from/to ucs-2 and ucs-4 (will use utf-8 +/// instead). +/// Afterwards invoke with "from" and "to" equal to NULL to cleanup. +/// +/// @return FAIL when conversion is not supported, OK otherwise. +int convert_setup(vimconv_T *vcp, char *from, char *to) { - return convert_setup_ext(vcp, from, true, to, true); + return convert_setup_ext(vcp, (char_u *)from, true, (char_u *)to, true); } /// As convert_setup(), but only when from_unicode_is_utf8 is true will all @@ -2509,16 +2521,14 @@ int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, c return OK; } -/* - * Convert text "ptr[*lenp]" according to "vcp". - * Returns the result in allocated memory and sets "*lenp". - * When "lenp" is NULL, use NUL terminated strings. - * Illegal chars are often changed to "?", unless vcp->vc_fail is set. - * When something goes wrong, NULL is returned and "*lenp" is unchanged. - */ -char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp) +/// Convert text "ptr[*lenp]" according to "vcp". +/// Returns the result in allocated memory and sets "*lenp". +/// When "lenp" is NULL, use NUL terminated strings. +/// Illegal chars are often changed to "?", unless vcp->vc_fail is set. +/// When something goes wrong, NULL is returned and "*lenp" is unchanged. +char *string_convert(const vimconv_T *const vcp, char *ptr, size_t *lenp) { - return string_convert_ext(vcp, ptr, lenp, NULL); + return (char *)string_convert_ext(vcp, (char_u *)ptr, lenp, NULL); } /* @@ -2548,7 +2558,7 @@ char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp case CONV_TO_UTF8: // latin1 to utf-8 conversion retval = xmalloc(len * 2 + 1); d = retval; - for (size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < len; i++) { c = ptr[i]; if (c < 0x80) { *d++ = (char_u)c; @@ -2566,7 +2576,7 @@ char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp case CONV_9_TO_UTF8: // latin9 to utf-8 conversion retval = xmalloc(len * 3 + 1); d = retval; - for (size_t i = 0; i < len; ++i) { + for (size_t i = 0; i < len; i++) { c = ptr[i]; switch (c) { case 0xa4: @@ -2678,3 +2688,174 @@ char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp return retval; } + +/// Table set by setcellwidths(). +typedef struct { + long first; + long last; + char width; +} cw_interval_T; + +static cw_interval_T *cw_table = NULL; +static size_t cw_table_size = 0; + +/// Return the value of the cellwidth table for the character `c`. +/// +/// @param c The source character. +/// @return 1 or 2 when `c` is in the cellwidth table, 0 if not. +static int cw_value(int c) +{ + if (cw_table == NULL) { + return 0; + } + + // first quick check for Latin1 etc. characters + if (c < cw_table[0].first) { + return 0; + } + + // binary search in table + int bot = 0; + int top = (int)cw_table_size - 1; + while (top >= bot) { + int mid = (bot + top) / 2; + if (cw_table[mid].last < c) { + bot = mid + 1; + } else if (cw_table[mid].first > c) { + top = mid - 1; + } else { + return cw_table[mid].width; + } + } + return 0; +} + +static int tv_nr_compare(const void *a1, const void *a2) +{ + const listitem_T *const li1 = tv_list_first(*(const list_T **)a1); + const listitem_T *const li2 = tv_list_first(*(const list_T **)a2); + + return (int)(TV_LIST_ITEM_TV(li1)->vval.v_number - TV_LIST_ITEM_TV(li2)->vval.v_number); +} + +/// "setcellwidths()" function +void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) +{ + if (argvars[0].v_type != VAR_LIST || argvars[0].vval.v_list == NULL) { + emsg(_(e_listreq)); + return; + } + const list_T *const l = argvars[0].vval.v_list; + if (tv_list_len(l) == 0) { + // Clearing the table. + xfree(cw_table); + cw_table = NULL; + cw_table_size = 0; + return; + } + + // Note: use list_T instead of listitem_T so that TV_LIST_ITEM_NEXT can be used properly below. + const list_T **ptrs = xmalloc(sizeof(const list_T *) * (size_t)tv_list_len(l)); + + // Check that all entries are a list with three numbers, the range is + // valid and the cell width is valid. + int item = 0; + TV_LIST_ITER_CONST(l, li, { + const typval_T *const li_tv = TV_LIST_ITEM_TV(li); + + if (li_tv->v_type != VAR_LIST || li_tv->vval.v_list == NULL) { + semsg(_(e_list_item_nr_is_not_list), item); + xfree(ptrs); + return; + } + + const list_T *const li_l = li_tv->vval.v_list; + ptrs[item] = li_l; + const listitem_T *lili = tv_list_first(li_l); + int i; + varnumber_T n1; + for (i = 0; lili != NULL; lili = TV_LIST_ITEM_NEXT(li_l, lili), i++) { + const typval_T *const lili_tv = TV_LIST_ITEM_TV(lili); + if (lili_tv->v_type != VAR_NUMBER) { + break; + } + if (i == 0) { + n1 = lili_tv->vval.v_number; + if (n1 < 0x100) { + emsg(_(e_only_values_of_0x100_and_higher_supported)); + xfree(ptrs); + return; + } + } else if (i == 1 && lili_tv->vval.v_number < n1) { + semsg(_(e_list_item_nr_range_invalid), item); + xfree(ptrs); + return; + } else if (i == 2 && (lili_tv->vval.v_number < 1 || lili_tv->vval.v_number > 2)) { + semsg(_(e_list_item_nr_cell_width_invalid), item); + xfree(ptrs); + return; + } + } + + if (i != 3) { + semsg(_(e_list_item_nr_does_not_contain_3_numbers), item); + xfree(ptrs); + return; + } + + item++; + }); + + // Sort the list on the first number. + qsort((void *)ptrs, (size_t)tv_list_len(l), sizeof(const list_T *), tv_nr_compare); + + cw_interval_T *table = xmalloc(sizeof(cw_interval_T) * (size_t)tv_list_len(l)); + + // Store the items in the new table. + for (item = 0; item < tv_list_len(l); item++) { + const list_T *const li_l = ptrs[item]; + const listitem_T *lili = tv_list_first(li_l); + const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number; + if (item > 0 && n1 <= table[item - 1].last) { + semsg(_(e_overlapping_ranges_for_nr), (long)n1); + xfree(ptrs); + xfree(table); + return; + } + table[item].first = n1; + lili = TV_LIST_ITEM_NEXT(li_l, lili); + table[item].last = TV_LIST_ITEM_TV(lili)->vval.v_number; + lili = TV_LIST_ITEM_NEXT(li_l, lili); + table[item].width = (char)TV_LIST_ITEM_TV(lili)->vval.v_number; + } + + xfree(ptrs); + + cw_interval_T *const cw_table_save = cw_table; + const size_t cw_table_size_save = cw_table_size; + cw_table = table; + cw_table_size = (size_t)tv_list_len(l); + + // Check that the new value does not conflict with 'listchars' or + // 'fillchars'. + const char *const error = check_chars_options(); + if (error != NULL) { + emsg(_(error)); + cw_table = cw_table_save; + cw_table_size = cw_table_size_save; + xfree(table); + return; + } + + xfree(cw_table_save); + redraw_all_later(UPD_NOT_VALID); +} + +void f_charclass(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) +{ + if (tv_check_for_string(&argvars[0]) == FAIL + || argvars[0].vval.v_string == NULL) { + return; + } + rettv->vval.v_number = mb_get_class((const char_u *)argvars[0].vval.v_string); +} |