diff options
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 466 |
1 files changed, 229 insertions, 237 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index 8b50ba719a..f2883cc5c7 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1,6 +1,3 @@ -// This is an open source non-commercial project. Dear PVS-Studio, please check -// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com - /// mbyte.c: Code specifically for handling multi-byte characters. /// Multibyte extensions partly by Sung-Hoon Baek /// @@ -29,18 +26,21 @@ #include <ctype.h> #include <errno.h> #include <iconv.h> +#include <locale.h> #include <stdbool.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <wchar.h> +#include <sys/types.h> #include <wctype.h> #include "auto/config.h" #include "nvim/arabic.h" -#include "nvim/ascii.h" +#include "nvim/ascii_defs.h" #include "nvim/buffer_defs.h" #include "nvim/charset.h" +#include "nvim/cmdexpand_defs.h" #include "nvim/cursor.h" #include "nvim/drawscreen.h" #include "nvim/eval/typval.h" @@ -48,28 +48,23 @@ #include "nvim/getchar.h" #include "nvim/gettext.h" #include "nvim/globals.h" -#include "nvim/grid_defs.h" -#include "nvim/iconv.h" +#include "nvim/grid.h" +#include "nvim/iconv_defs.h" #include "nvim/keycodes.h" -#include "nvim/macros.h" +#include "nvim/macros_defs.h" #include "nvim/mark.h" #include "nvim/mbyte.h" #include "nvim/mbyte_defs.h" #include "nvim/memline.h" #include "nvim/memory.h" #include "nvim/message.h" -#include "nvim/option_defs.h" +#include "nvim/option_vars.h" +#include "nvim/optionstr.h" #include "nvim/os/os.h" -#include "nvim/os/os_defs.h" -#include "nvim/pos.h" -#include "nvim/screen.h" +#include "nvim/pos_defs.h" #include "nvim/strings.h" -#include "nvim/types.h" -#include "nvim/vim.h" - -#ifdef HAVE_LOCALE_H -# include <locale.h> -#endif +#include "nvim/types_defs.h" +#include "nvim/vim_defs.h" typedef struct { int rangeStart; @@ -79,8 +74,8 @@ typedef struct { } convertStruct; struct interval { - long first; - long last; + int first; + int last; }; // uncrustify:off @@ -90,17 +85,17 @@ struct interval { #endif // uncrustify:on -static char e_list_item_nr_is_not_list[] +static const char e_list_item_nr_is_not_list[] = N_("E1109: List item %d is not a List"); -static char e_list_item_nr_does_not_contain_3_numbers[] +static const char e_list_item_nr_does_not_contain_3_numbers[] = N_("E1110: List item %d does not contain 3 numbers"); -static char e_list_item_nr_range_invalid[] +static const char e_list_item_nr_range_invalid[] = N_("E1111: List item %d range invalid"); -static char e_list_item_nr_cell_width_invalid[] +static const char e_list_item_nr_cell_width_invalid[] = N_("E1112: List item %d cell width invalid"); -static char e_overlapping_ranges_for_nr[] +static const char e_overlapping_ranges_for_nr[] = N_("E1113: Overlapping ranges for 0x%lx"); -static char e_only_values_of_0x80_and_higher_supported[] +static const char e_only_values_of_0x80_and_higher_supported[] = N_("E1114: Only values of 0x80 and higher supported"); // To speed up BYTELEN(); keep a lookup table to quickly get the length in @@ -370,7 +365,7 @@ static int enc_canon_search(const char *name) int enc_canon_props(const char *name) FUNC_ATTR_PURE { - int i = enc_canon_search((char *)name); + int i = enc_canon_search(name); if (i >= 0) { return enc_canon_table[i].prop; } else if (strncmp(name, "2byte-", 6) == 0) { @@ -449,18 +444,16 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab) static bool intable(const struct interval *table, size_t n_items, int c) FUNC_ATTR_PURE { - int mid, bot, top; - // first quick check for Latin1 etc. characters if (c < table[0].first) { return false; } // binary search in table - bot = 0; - top = (int)(n_items - 1); + int bot = 0; + int top = (int)(n_items - 1); while (top >= bot) { - mid = (bot + top) / 2; + int mid = (bot + top) / 2; if (table[mid].last < c) { bot = mid + 1; } else if (table[mid].first > c) { @@ -518,11 +511,9 @@ int utf_char2cells(int c) /// This doesn't take care of unprintable characters, use ptr2cells() for that. int utf_ptr2cells(const char *p) { - int c; - // Need to convert to a character number. if ((uint8_t)(*p) >= 0x80) { - c = utf_ptr2char(p); + int c = utf_ptr2char(p); // An illegal byte is displayed as <xx>. if (utf_ptr2len(p) == 1 || c == NUL) { return 4; @@ -540,16 +531,14 @@ int utf_ptr2cells(const char *p) /// For an empty string or truncated character returns 1. int utf_ptr2cells_len(const char *p, int size) { - int c; - // Need to convert to a wide character. if (size > 0 && (uint8_t)(*p) >= 0x80) { if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) { return 1; // truncated } - c = utf_ptr2char((char *)p); + int c = utf_ptr2char(p); // An illegal byte is displayed as <xx>. - if (utf_ptr2len((char *)p) == 1 || c == NUL) { + if (utf_ptr2len(p) == 1 || c == NUL) { return 4; } // If the char is ASCII it must be an overlong sequence. @@ -662,34 +651,32 @@ int utf_ptr2char(const char *const p_in) // // If byte sequence is illegal or incomplete, returns -1 and does not advance // "s". -static int utf_safe_read_char_adv(const char_u **s, size_t *n) +static int utf_safe_read_char_adv(const char **s, size_t *n) { - int c; - if (*n == 0) { // end of buffer return 0; } - uint8_t k = utf8len_tab_zero[**s]; + uint8_t k = utf8len_tab_zero[(uint8_t)(**s)]; if (k == 1) { // ASCII character or NUL (*n)--; - return *(*s)++; + return (uint8_t)(*(*s)++); } if (k <= *n) { // We have a multibyte sequence and it isn't truncated by buffer // limits so utf_ptr2char() is safe to use. Or the first byte is // illegal (k=0), and it's also safe to use utf_ptr2char(). - c = utf_ptr2char((char *)(*s)); + int c = utf_ptr2char(*s); // On failure, utf_ptr2char() returns the first byte, so here we // check equality with the first byte. The only non-ASCII character // which equals the first byte of its own UTF-8 representation is // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. // It's safe even if n=1, else we would have k=2 > n. - if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) { + if (c != (int)((uint8_t)(**s)) || (c == 0xC3 && (uint8_t)(*s)[1] == 0x83)) { // byte sequence was successfully decoded *s += k; *n -= k; @@ -705,9 +692,7 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n) // Note: composing characters are skipped! int mb_ptr2char_adv(const char **const pp) { - int c; - - c = utf_ptr2char(*pp); + int c = utf_ptr2char(*pp); *pp += utfc_ptr2len(*pp); return c; } @@ -716,9 +701,7 @@ int mb_ptr2char_adv(const char **const pp) // Note: composing characters are returned as separate characters. int mb_cptr2char_adv(const char **pp) { - int c; - - c = utf_ptr2char(*pp); + int c = utf_ptr2char(*pp); *pp += utf_ptr2len(*pp); return c; } @@ -728,92 +711,78 @@ int mb_cptr2char_adv(const char **pp) /// behaves like a composing character. bool utf_composinglike(const char *p1, const char *p2) { - int c2; - - c2 = utf_ptr2char((char *)p2); + int c2 = utf_ptr2char(p2); if (utf_iscomposing(c2)) { return true; } if (!arabic_maycombine(c2)) { return false; } - return arabic_combine(utf_ptr2char((char *)p1), c2); + return arabic_combine(utf_ptr2char(p1), c2); } -/// Convert a UTF-8 string to a wide character +/// Get the screen char at the beginning of a string +/// +/// Caller is expected to check for things like unprintable chars etc +/// If first char in string is a composing char, prepend a space to display it correctly. /// -/// Also gets up to #MAX_MCO composing characters. +/// If "p" starts with an invalid sequence, zero is returned. /// -/// @param[out] pcc Location where to store composing characters. Must have -/// space at least for #MAX_MCO + 1 elements. +/// @param[out] firstc (required) The first codepoint of the screen char, +/// or the first byte of an invalid sequence /// -/// @return leading character. -int utfc_ptr2char(const char *p, int *pcc) +/// @return the char +schar_T utfc_ptr2schar(const char *p, int *firstc) + FUNC_ATTR_NONNULL_ALL { - int i = 0; - int c = utf_ptr2char(p); - int len = utf_ptr2len(p); + *firstc = c; // NOT optional, you are gonna need it + bool first_compose = utf_iscomposing(c); + size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose; + size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen); - // Only accept a composing char when the first char isn't illegal. - if ((len > 1 || (uint8_t)(*p) < 0x80) - && (uint8_t)p[len] >= 0x80 - && utf_composinglike(p, p + len)) { - int cc = utf_ptr2char(p + len); - for (;;) { - pcc[i++] = cc; - if (i == MAX_MCO) { - break; - } - len += utf_ptr2len(p + len); - if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) { - break; - } - } - } - - if (i < MAX_MCO) { // last composing char must be 0 - pcc[i] = 0; + if (len == 1 && (uint8_t)(*p) >= 0x80) { + return 0; // invalid sequence } - return c; + return schar_from_buf_first(p, len, first_compose); } -// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO -// composing characters. Use no more than p[maxlen]. -// -// @param [out] pcc: composing chars, last one is 0 -int utfc_ptr2char_len(const char *p, int *pcc, int maxlen) +/// Get the screen char at the beginning of a string with length +/// +/// Like utfc_ptr2schar but use no more than p[maxlen]. +schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc) + FUNC_ATTR_NONNULL_ALL { assert(maxlen > 0); - int i = 0; + size_t len = (size_t)utf_ptr2len_len(p, maxlen); + if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) { + // invalid or truncated sequence + *firstc = (uint8_t)(*p); + return 0; + } - int len = utf_ptr2len_len(p, maxlen); - // Is it safe to use utf_ptr2char()? - bool safe = len > 1 && len <= maxlen; - int c = safe ? utf_ptr2char(p) : (uint8_t)(*p); + int c = utf_ptr2char(p); + *firstc = c; + bool first_compose = utf_iscomposing(c); + maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose); + len = (size_t)utfc_ptr2len_len(p, maxlen); - // Only accept a composing char when the first char isn't illegal. - if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) { - for (; i < MAX_MCO; i++) { - int len_cc = utf_ptr2len_len(p + len, maxlen - len); - safe = len_cc > 1 && len_cc <= maxlen - len; - if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80 - || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) { - break; - } - len += len_cc; - } - } + return schar_from_buf_first(p, len, first_compose); +} - if (i < MAX_MCO) { - // last composing char must be 0 - pcc[i] = 0; +/// Caller must ensure there is space for `first_compose` +static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose) +{ + if (first_compose) { + char cbuf[MAX_SCHAR_SIZE]; + cbuf[0] = ' '; + memcpy(cbuf + 1, buf, len); + return schar_from_buf(cbuf, len + 1); + } else { + return schar_from_buf(buf, len); } - - return c; -#undef ISCOMPOSING } /// Get the length of a UTF-8 byte sequence representing a single codepoint @@ -854,11 +823,9 @@ int utf_byte2len(int b) // Never returns zero. int utf_ptr2len_len(const char *p, int size) { - int len; - int i; int m; - len = utf8len_tab[(uint8_t)(*p)]; + int len = utf8len_tab[(uint8_t)(*p)]; if (len == 1) { return 1; // NUL, ascii or illegal lead byte } @@ -867,7 +834,7 @@ int utf_ptr2len_len(const char *p, int size) } else { m = len; } - for (i = 1; i < m; i++) { + for (int i = 1; i < m; i++) { if ((p[i] & 0xc0) != 0x80) { return 1; } @@ -898,10 +865,9 @@ int utfc_ptr2len(const char *const p) return 1; } - // Check for composing characters. We can handle only the first six, but - // skip all of them (otherwise the cursor would get stuck). + // Check for composing characters. int prevlen = 0; - for (;;) { + while (true) { if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) { return len; } @@ -918,9 +884,6 @@ int utfc_ptr2len(const char *const p) /// Returns 1 for an illegal char or an incomplete byte sequence. int utfc_ptr2len_len(const char *p, int size) { - int len; - int prevlen; - if (size < 1 || *p == NUL) { return 0; } @@ -929,7 +892,7 @@ int utfc_ptr2len_len(const char *p, int size) } // Skip over first UTF-8 char, stopping at a NUL byte. - len = utf_ptr2len_len(p, size); + int len = utf_ptr2len_len(p, size); // Check for illegal byte and incomplete byte sequence. if ((len == 1 && (uint8_t)p[0] >= 0x80) || len > size) { @@ -938,17 +901,15 @@ int utfc_ptr2len_len(const char *p, int size) // Check for composing characters. We can handle only the first six, but // skip all of them (otherwise the cursor would get stuck). - prevlen = 0; + int prevlen = 0; while (len < size) { - int len_next_char; - if ((uint8_t)p[len] < 0x80) { break; } // Next character length should not go beyond size to ensure that // utf_composinglike(...) does not read beyond size. - len_next_char = utf_ptr2len_len(p + len, size - len); + int len_next_char = utf_ptr2len_len(p + len, size - len); if (len_next_char > size - len) { break; } @@ -1063,9 +1024,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab) { // sorted list of non-overlapping intervals static struct clinterval { - unsigned int first; - unsigned int last; - unsigned int class; + unsigned first; + unsigned last; + unsigned cls; } classes[] = { { 0x037e, 0x037e, 1 }, // Greek question mark { 0x0387, 0x0387, 1 }, // Greek ano teleia @@ -1141,7 +1102,6 @@ int utf_class_tab(const int c, const uint64_t *const chartab) }; int bot = 0; int top = ARRAY_SIZE(classes) - 1; - int mid; // First quick check for Latin1 characters, use 'iskeyword'. if (c < 0x100) { @@ -1161,13 +1121,13 @@ int utf_class_tab(const int c, const uint64_t *const chartab) // binary search in table while (top >= bot) { - mid = (bot + top) / 2; - if (classes[mid].last < (unsigned int)c) { + int mid = (bot + top) / 2; + if (classes[mid].last < (unsigned)c) { bot = mid + 1; - } else if (classes[mid].first > (unsigned int)c) { + } else if (classes[mid].first > (unsigned)c) { top = mid - 1; } else { - return (int)classes[mid].class; + return (int)classes[mid].cls; } } @@ -1186,13 +1146,12 @@ bool utf_ambiguous_width(int c) // the given conversion "table". Uses binary search on "table". static int utf_convert(int a, const convertStruct *const table, size_t n_items) { - size_t start, mid, end; // indices into table - - start = 0; - end = n_items; + // indices into table + size_t start = 0; + size_t end = n_items; while (start < end) { // need to search further - mid = (end + start) / 2; + size_t mid = (end + start) / 2; if (table[mid].rangeEnd < a) { start = mid + 1; } else { @@ -1285,12 +1244,12 @@ bool mb_isalpha(int a) return mb_islower(a) || mb_isupper(a); } -static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2) +static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2) { - int c1, c2, cdiff; + int c1, c2; char buffer[6]; - for (;;) { + while (true) { c1 = utf_safe_read_char_adv(&s1, &n1); c2 = utf_safe_read_char_adv(&s2, &n2); @@ -1302,7 +1261,7 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2 continue; } - cdiff = utf_fold(c1) - utf_fold(c2); + int cdiff = utf_fold(c1) - utf_fold(c2); if (cdiff != 0) { return cdiff; } @@ -1326,15 +1285,15 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2 // to fold just one character to determine the result of comparison. if (c1 != -1 && c2 == -1) { - n1 = (size_t)utf_char2bytes(utf_fold(c1), (char *)buffer); - s1 = (char_u *)buffer; + n1 = (size_t)utf_char2bytes(utf_fold(c1), buffer); + s1 = buffer; } else if (c2 != -1 && c1 == -1) { - n2 = (size_t)utf_char2bytes(utf_fold(c2), (char *)buffer); - s2 = (char_u *)buffer; + n2 = (size_t)utf_char2bytes(utf_fold(c2), buffer); + s2 = buffer; } while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) { - cdiff = (int)(*s1) - (int)(*s2); + int cdiff = (int)((uint8_t)(*s1)) - (int)((uint8_t)(*s2)); if (cdiff != 0) { return cdiff; } @@ -1483,11 +1442,11 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_ FUNC_ATTR_NONNULL_ALL { size_t count = 0; - size_t clen, i; + size_t clen; if (index == 0) { return 0; } - for (i = 0; i < len; i += clen) { + for (size_t i = 0; i < len; i += clen) { clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i)); // NB: gets the byte value of invalid sequence bytes. // we only care whether the char fits in the BMP or not @@ -1512,7 +1471,7 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_ /// two characters otherwise. int mb_strnicmp(const char *s1, const char *s2, const size_t nn) { - return utf_strnicmp((char_u *)s1, (char_u *)s2, nn, nn); + return utf_strnicmp(s1, s2, nn, nn); } /// Compare strings case-insensitively @@ -1536,23 +1495,18 @@ int mb_stricmp(const char *s1, const char *s2) // 'encoding' has been set to. void show_utf8(void) { - int len; - int rlen = 0; - char *line; - int clen; - int i; - // Get the byte length of the char under the cursor, including composing // characters. - line = get_cursor_pos_ptr(); - len = utfc_ptr2len(line); + char *line = get_cursor_pos_ptr(); + int len = utfc_ptr2len(line); if (len == 0) { - msg("NUL"); + msg("NUL", 0); return; } - clen = 0; - for (i = 0; i < len; i++) { + size_t rlen = 0; + int clen = 0; + for (int i = 0; i < len; i++) { if (clen == 0) { // start of (composing) character, get its length if (i > 0) { @@ -1561,16 +1515,17 @@ void show_utf8(void) } clen = utf_ptr2len(line + i); } - sprintf(IObuff + rlen, "%02x ", // NOLINT(runtime/printf) - (line[i] == NL) ? NUL : (uint8_t)line[i]); // NUL is stored as NL + assert(IOSIZE > rlen); + snprintf(IObuff + rlen, IOSIZE - rlen, "%02x ", + (line[i] == NL) ? NUL : (uint8_t)line[i]); // NUL is stored as NL clen--; - rlen += (int)strlen(IObuff + rlen); + rlen += strlen(IObuff + rlen); if (rlen > IOSIZE - 20) { break; } } - msg(IObuff); + msg(IObuff, 0); } /// Return offset from "p" to the start of a character, including composing characters. @@ -1579,9 +1534,6 @@ void show_utf8(void) /// Returns 0 when already at the first byte of a character. int utf_head_off(const char *base_in, const char *p_in) { - int c; - int len; - if ((uint8_t)(*p_in) < 0x80) { // be quick for ASCII return 0; } @@ -1603,7 +1555,7 @@ int utf_head_off(const char *base_in, const char *p_in) } // Check for illegal sequence. Do allow an illegal byte after where we // started. - len = utf8len_tab[*q]; + int len = utf8len_tab[*q]; if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) { return 0; } @@ -1612,7 +1564,7 @@ int utf_head_off(const char *base_in, const char *p_in) break; } - c = utf_ptr2char((char *)q); + int c = utf_ptr2char((char *)q); if (utf_iscomposing(c)) { continue; } @@ -1669,7 +1621,7 @@ bool utf_allow_break_before(int cc) 0x2021, // ‡ double dagger 0x2026, // … horizontal ellipsis 0x2030, // ‰ per mille sign - 0x2031, // ‱ per then thousand sign + 0x2031, // ‱ per the thousand sign 0x203c, // ‼ double exclamation mark 0x2047, // ⁇ double question mark 0x2048, // ⁈ question exclamation mark @@ -1795,7 +1747,6 @@ int mb_off_next(const char *base, const char *p_in) { const uint8_t *p = (uint8_t *)p_in; int i; - int j; if (*p < 0x80) { // be quick for ASCII return 0; @@ -1804,6 +1755,7 @@ int mb_off_next(const char *base, const char *p_in) // Find the next character that isn't 10xx.xxxx for (i = 0; (p[i] & 0xc0) == 0x80; i++) {} if (i > 0) { + int j; // Check for illegal sequence. for (j = 0; p - j > (uint8_t *)base; j++) { if ((p[-j] & 0xc0) != 0x80) { @@ -1849,33 +1801,35 @@ int utf_cp_tail_off(const char *base, const char *p_in) /// Return the offset from "p" to the first byte of the codepoint it points /// to. Can start anywhere in a stream of bytes. /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters -/// separately and returns a negative offset. +/// separately. /// /// @param[in] base Pointer to start of string /// @param[in] p Pointer to byte for which to return the offset to the previous codepoint // -/// @return 0 if invalid sequence, else offset to previous codepoint -int utf_cp_head_off(const char_u *base, const char_u *p) +/// @return 0 if invalid sequence, else number of bytes to previous codepoint +int utf_cp_head_off(const char *base, const char *p) { int i; - int j; if (*p == NUL) { return 0; } // Find the first character that is not 10xx.xxxx - for (i = 0; p - i > base; i--) { - if ((p[i] & 0xc0) != 0x80) { + for (i = 0; p - i >= base; i++) { + if (((uint8_t)p[-i] & 0xc0) != 0x80) { break; } } - // Find the last character that is 10xx.xxxx - for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {} + // Find the last character that is 10xx.xxxx (condition terminates on NUL) + int j = 1; + while (((uint8_t)p[j] & 0xc0) == 0x80) { + j++; + } // Check for illegal sequence. - if (utf8len_tab[p[i]] == 1) { + if (utf8len_tab[(uint8_t)p[-i]] != j + i) { return 0; } return i; @@ -1885,8 +1839,6 @@ int utf_cp_head_off(const char_u *base, const char_u *p) void utf_find_illegal(void) { pos_T pos = curwin->w_cursor; - char *p; - int len; vimconv_T vimconv; char *tofree = NULL; @@ -1899,8 +1851,8 @@ void utf_find_illegal(void) } curwin->w_cursor.coladd = 0; - for (;;) { - p = get_cursor_pos_ptr(); + while (true) { + char *p = get_cursor_pos_ptr(); if (vimconv.vc_type != CONV_NONE) { xfree(tofree); tofree = string_convert(&vimconv, p, NULL); @@ -1913,7 +1865,7 @@ void utf_find_illegal(void) while (*p != NUL) { // Illegal means that there are not enough trail bytes (checked by // utf_ptr2len()) or too many of them (overlong sequence). - len = utf_ptr2len(p); + int len = utf_ptr2len(p); if ((uint8_t)(*p) >= 0x80 && (len == 1 || utf_char2len(utf_ptr2char(p)) != len)) { if (vimconv.vc_type == CONV_NONE) { curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr()); @@ -1948,16 +1900,16 @@ theend: /// @return true if string "s" is a valid utf-8 string. /// When "end" is NULL stop at the first NUL. Otherwise stop at "end". -bool utf_valid_string(const char_u *s, const char_u *end) +bool utf_valid_string(const char *s, const char *end) { - const char_u *p = s; + const uint8_t *p = (uint8_t *)s; - while (end == NULL ? *p != NUL : p < end) { + while (end == NULL ? *p != NUL : p < (uint8_t *)end) { int l = utf8len_tab_zero[*p]; if (l == 0) { return false; // invalid lead byte } - if (end != NULL && p + l > end) { + if (end != NULL && p + l > (uint8_t *)end) { return false; // incomplete byte sequence } p++; @@ -1988,7 +1940,7 @@ void mb_check_adjust_col(void *win_) // Column 0 is always valid. if (oldcol != 0) { - char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false); + char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum); colnr_T len = (colnr_T)strlen(p); // Empty line or invalid column? @@ -2042,6 +1994,24 @@ int mb_charlen(const char *str) return count; } +int mb_charlen2bytelen(const char *str, int charlen) +{ + const char *p = str; + int count = 0; + + if (p == NULL) { + return 0; + } + + for (int i = 0; *p != NUL && i < charlen; i++) { + int b = utfc_ptr2len(p); + p += b; + count += b; + } + + return count; +} + /// Like mb_charlen() but for a string with specified length. int mb_charlen_len(const char *str, int len) { @@ -2122,7 +2092,6 @@ char *enc_skip(char *p) char *enc_canonize(char *enc) FUNC_ATTR_NONNULL_RET { - char *p, *s; if (strcmp(enc, "default") == 0) { // Use the default encoding as found by set_init_1(). return xstrdup(fenc_default); @@ -2131,8 +2100,8 @@ char *enc_canonize(char *enc) // copy "enc" to allocated memory, with room for two '-' char *r = xmalloc(strlen(enc) + 3); // Make it all lower case and replace '_' with '-'. - p = r; - for (s = enc; *s != NUL; s++) { + char *p = r; + for (char *s = enc; *s != NUL; s++) { if (*s == '_') { *p++ = '-'; } else { @@ -2184,9 +2153,7 @@ char *enc_canonize(char *enc) /// Returns -1 when not found. static int enc_alias_search(const char *name) { - int i; - - for (i = 0; enc_alias_table[i].name != NULL; i++) { + for (int i = 0; enc_alias_table[i].name != NULL; i++) { if (strcmp(name, enc_alias_table[i].name) == 0) { return enc_alias_table[i].canon; } @@ -2210,10 +2177,7 @@ char *enc_locale(void) if (!(s = nl_langinfo(CODESET)) || *s == NUL) #endif { -#if defined(HAVE_LOCALE_H) - if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) -#endif - { + if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) { if ((s = os_getenv("LC_ALL"))) { if ((s = os_getenv("LC_CTYPE"))) { s = os_getenv("LANG"); @@ -2269,17 +2233,14 @@ enc_locale_copy_enc: // (should return iconv_t, but that causes problems with prototypes). void *my_iconv_open(char *to, char *from) { - iconv_t fd; #define ICONV_TESTLEN 400 char tobuf[ICONV_TESTLEN]; - char *p; - size_t tolen; static WorkingStatus iconv_working = kUnknown; if (iconv_working == kBroken) { return (void *)-1; // detected a broken iconv() previously } - fd = iconv_open(enc_skip(to), enc_skip(from)); + iconv_t fd = iconv_open(enc_skip(to), enc_skip(from)); if (fd != (iconv_t)-1 && iconv_working == kUnknown) { // Do a dummy iconv() call to check if it actually works. There is a @@ -2287,8 +2248,8 @@ void *my_iconv_open(char *to, char *from) // because it's wide-spread. The symptoms are that after outputting // the initial shift state the "to" pointer is NULL and conversion // stops for no apparent reason after about 8160 characters. - p = tobuf; - tolen = ICONV_TESTLEN; + char *p = tobuf; + size_t tolen = ICONV_TESTLEN; (void)iconv(fd, NULL, NULL, &p, &tolen); if (p == NULL) { iconv_working = kBroken; @@ -2310,24 +2271,19 @@ void *my_iconv_open(char *to, char *from) static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t slen, size_t *unconvlenp, size_t *resultlenp) { - const char *from; - size_t fromlen; char *to; - size_t tolen; size_t len = 0; size_t done = 0; char *result = NULL; - char *p; - int l; - from = str; - fromlen = slen; - for (;;) { + const char *from = str; + size_t fromlen = slen; + while (true) { if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) { // Allocate enough room for most conversions. When re-allocating // increase the buffer size. len = len + fromlen * 2 + 40; - p = xmalloc(len); + char *p = xmalloc(len); if (done > 0) { memmove(p, result, done); } @@ -2336,7 +2292,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl } to = result + done; - tolen = len - done - 2; + size_t tolen = len - done - 2; // Avoid a warning for systems with a wrong iconv() prototype by // casting the second argument to void *. if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) { @@ -2366,7 +2322,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl if (utf_ptr2cells(from) > 1) { *to++ = '?'; } - l = utfc_ptr2len_len(from, (int)fromlen); + int l = utfc_ptr2len_len(from, (int)fromlen); from += l; fromlen -= (size_t)l; } else if (ICONV_ERRNO != ICONV_E2BIG) { @@ -2384,6 +2340,34 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl return result; } +/// iconv() function +void f_iconv(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) +{ + vimconv_T vimconv; + + rettv->v_type = VAR_STRING; + rettv->vval.v_string = NULL; + + const char *const str = tv_get_string(&argvars[0]); + char buf1[NUMBUFLEN]; + char *const from = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[1], buf1))); + char buf2[NUMBUFLEN]; + char *const to = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[2], buf2))); + vimconv.vc_type = CONV_NONE; + convert_setup(&vimconv, from, to); + + // If the encodings are equal, no conversion needed. + if (vimconv.vc_type == CONV_NONE) { + rettv->vval.v_string = xstrdup(str); + } else { + rettv->vval.v_string = string_convert(&vimconv, (char *)str, NULL); + } + + convert_setup(&vimconv, NULL, NULL); + xfree(from); + xfree(to); +} + /// Setup "vcp" for conversion from "from" to "to". /// The names must have been made canonical with enc_canonize(). /// vcp->vc_type must have been initialized to CONV_NONE. @@ -2402,8 +2386,6 @@ int convert_setup(vimconv_T *vcp, char *from, char *to) int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, char *to, bool to_unicode_is_utf8) { - int from_prop; - int to_prop; int from_is_utf8; int to_is_utf8; @@ -2419,8 +2401,8 @@ int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, cha return OK; } - from_prop = enc_canon_props(from); - to_prop = enc_canon_props(to); + int from_prop = enc_canon_props(from); + int to_prop = enc_canon_props(to); if (from_unicode_is_utf8) { from_is_utf8 = from_prop & ENC_UNICODE; } else { @@ -2477,9 +2459,8 @@ char *string_convert(const vimconv_T *const vcp, char *ptr, size_t *lenp) // set to the number of remaining bytes. char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, size_t *unconvlenp) { - char_u *retval = NULL; - char_u *d; - int l; + uint8_t *retval = NULL; + uint8_t *d; int c; size_t len; @@ -2499,10 +2480,10 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si for (size_t i = 0; i < len; i++) { c = (uint8_t)ptr[i]; if (c < 0x80) { - *d++ = (char_u)c; + *d++ = (uint8_t)c; } else { - *d++ = (char_u)(0xc0 + (char_u)((unsigned)c >> 6)); - *d++ = (char_u)(0x80 + (c & 0x3f)); + *d++ = (uint8_t)(0xc0 + (uint8_t)((unsigned)c >> 6)); + *d++ = (uint8_t)(0x80 + (c & 0x3f)); } } *d = NUL; @@ -2547,7 +2528,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si retval = xmalloc(len + 1); d = retval; for (size_t i = 0; i < len; i++) { - l = utf_ptr2len_len(ptr + i, (int)(len - i)); + int l = utf_ptr2len_len(ptr + i, (int)(len - i)); if (l == 0) { *d++ = NUL; } else if (l == 1) { @@ -2597,7 +2578,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si } if (!utf_iscomposing(c)) { // skip composing chars if (c < 0x100) { - *d++ = (char_u)c; + *d++ = (uint8_t)c; } else if (vcp->vc_fail) { xfree(retval); return NULL; @@ -2618,7 +2599,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si break; case CONV_ICONV: // conversion with vcp->vc_fd - retval = (char_u *)iconv_string(vcp, ptr, len, unconvlenp, lenp); + retval = (uint8_t *)iconv_string(vcp, ptr, len, unconvlenp, lenp); break; } @@ -2627,8 +2608,8 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si /// Table set by setcellwidths(). typedef struct { - long first; - long last; + int64_t first; + int64_t last; char width; } cw_interval_T; @@ -2753,7 +2734,7 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) const listitem_T *lili = tv_list_first(li_l); const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number; if (item > 0 && n1 <= table[item - 1].last) { - semsg(_(e_overlapping_ranges_for_nr), (long)n1); + semsg(_(e_overlapping_ranges_for_nr), (size_t)n1); xfree((void *)ptrs); xfree(table); return; @@ -2810,3 +2791,14 @@ void f_charclass(typval_T *argvars, typval_T *rettv, EvalFuncData fptr) } rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string); } + +/// Function given to ExpandGeneric() to obtain the possible arguments of the +/// encoding options. +char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx) +{ + if (idx >= (int)ARRAY_SIZE(enc_canon_table)) { + return NULL; + } + + return (char *)enc_canon_table[idx].name; +} |