1 files changed, 229 insertions, 237 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 8b50ba719a..f2883cc5c7 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1,6 +1,3 @@
-// This is an open source non-commercial project. Dear PVS-Studio, please check
-// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
-
 /// mbyte.c: Code specifically for handling multi-byte characters.
 /// Multibyte extensions partly by Sung-Hoon Baek
 ///
@@ -29,18 +26,21 @@
 #include <ctype.h>
 #include <errno.h>
 #include <iconv.h>
+#include <locale.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <wchar.h>
+#include <sys/types.h>
 #include <wctype.h>
 
 #include "auto/config.h"
 #include "nvim/arabic.h"
-#include "nvim/ascii.h"
+#include "nvim/ascii_defs.h"
 #include "nvim/buffer_defs.h"
 #include "nvim/charset.h"
+#include "nvim/cmdexpand_defs.h"
 #include "nvim/cursor.h"
 #include "nvim/drawscreen.h"
 #include "nvim/eval/typval.h"
@@ -48,28 +48,23 @@
 #include "nvim/getchar.h"
 #include "nvim/gettext.h"
 #include "nvim/globals.h"
-#include "nvim/grid_defs.h"
-#include "nvim/iconv.h"
+#include "nvim/grid.h"
+#include "nvim/iconv_defs.h"
 #include "nvim/keycodes.h"
-#include "nvim/macros.h"
+#include "nvim/macros_defs.h"
 #include "nvim/mark.h"
 #include "nvim/mbyte.h"
 #include "nvim/mbyte_defs.h"
 #include "nvim/memline.h"
 #include "nvim/memory.h"
 #include "nvim/message.h"
-#include "nvim/option_defs.h"
+#include "nvim/option_vars.h"
+#include "nvim/optionstr.h"
 #include "nvim/os/os.h"
-#include "nvim/os/os_defs.h"
-#include "nvim/pos.h"
-#include "nvim/screen.h"
+#include "nvim/pos_defs.h"
 #include "nvim/strings.h"
-#include "nvim/types.h"
-#include "nvim/vim.h"
-
-#ifdef HAVE_LOCALE_H
-# include <locale.h>
-#endif
+#include "nvim/types_defs.h"
+#include "nvim/vim_defs.h"
 
 typedef struct {
   int rangeStart;
@@ -79,8 +74,8 @@ typedef struct {
 } convertStruct;
 
 struct interval {
-  long first;
-  long last;
+  int first;
+  int last;
 };
 
 // uncrustify:off
@@ -90,17 +85,17 @@ struct interval {
 #endif
 // uncrustify:on
 
-static char e_list_item_nr_is_not_list[]
+static const char e_list_item_nr_is_not_list[]
   = N_("E1109: List item %d is not a List");
-static char e_list_item_nr_does_not_contain_3_numbers[]
+static const char e_list_item_nr_does_not_contain_3_numbers[]
   = N_("E1110: List item %d does not contain 3 numbers");
-static char e_list_item_nr_range_invalid[]
+static const char e_list_item_nr_range_invalid[]
   = N_("E1111: List item %d range invalid");
-static char e_list_item_nr_cell_width_invalid[]
+static const char e_list_item_nr_cell_width_invalid[]
   = N_("E1112: List item %d cell width invalid");
-static char e_overlapping_ranges_for_nr[]
+static const char e_overlapping_ranges_for_nr[]
   = N_("E1113: Overlapping ranges for 0x%lx");
-static char e_only_values_of_0x80_and_higher_supported[]
+static const char e_only_values_of_0x80_and_higher_supported[]
   = N_("E1114: Only values of 0x80 and higher supported");
 
 // To speed up BYTELEN(); keep a lookup table to quickly get the length in
@@ -370,7 +365,7 @@ static int enc_canon_search(const char *name)
 int enc_canon_props(const char *name)
   FUNC_ATTR_PURE
 {
-  int i = enc_canon_search((char *)name);
+  int i = enc_canon_search(name);
   if (i >= 0) {
     return enc_canon_table[i].prop;
   } else if (strncmp(name, "2byte-", 6) == 0) {
@@ -449,18 +444,16 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
 static bool intable(const struct interval *table, size_t n_items, int c)
   FUNC_ATTR_PURE
 {
-  int mid, bot, top;
-
   // first quick check for Latin1 etc. characters
   if (c < table[0].first) {
     return false;
   }
 
   // binary search in table
-  bot = 0;
-  top = (int)(n_items - 1);
+  int bot = 0;
+  int top = (int)(n_items - 1);
   while (top >= bot) {
-    mid = (bot + top) / 2;
+    int mid = (bot + top) / 2;
     if (table[mid].last < c) {
       bot = mid + 1;
     } else if (table[mid].first > c) {
@@ -518,11 +511,9 @@ int utf_char2cells(int c)
 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
 int utf_ptr2cells(const char *p)
 {
-  int c;
-
   // Need to convert to a character number.
   if ((uint8_t)(*p) >= 0x80) {
-    c = utf_ptr2char(p);
+    int c = utf_ptr2char(p);
     // An illegal byte is displayed as <xx>.
     if (utf_ptr2len(p) == 1 || c == NUL) {
       return 4;
@@ -540,16 +531,14 @@ int utf_ptr2cells(const char *p)
 /// For an empty string or truncated character returns 1.
 int utf_ptr2cells_len(const char *p, int size)
 {
-  int c;
-
   // Need to convert to a wide character.
   if (size > 0 && (uint8_t)(*p) >= 0x80) {
     if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
       return 1;        // truncated
     }
-    c = utf_ptr2char((char *)p);
+    int c = utf_ptr2char(p);
     // An illegal byte is displayed as <xx>.
-    if (utf_ptr2len((char *)p) == 1 || c == NUL) {
+    if (utf_ptr2len(p) == 1 || c == NUL) {
       return 4;
     }
     // If the char is ASCII it must be an overlong sequence.
@@ -662,34 +651,32 @@ int utf_ptr2char(const char *const p_in)
 //
 // If byte sequence is illegal or incomplete, returns -1 and does not advance
 // "s".
-static int utf_safe_read_char_adv(const char_u **s, size_t *n)
+static int utf_safe_read_char_adv(const char **s, size_t *n)
 {
-  int c;
-
   if (*n == 0) {  // end of buffer
     return 0;
   }
 
-  uint8_t k = utf8len_tab_zero[**s];
+  uint8_t k = utf8len_tab_zero[(uint8_t)(**s)];
 
   if (k == 1) {
     // ASCII character or NUL
     (*n)--;
-    return *(*s)++;
+    return (uint8_t)(*(*s)++);
   }
 
   if (k <= *n) {
     // We have a multibyte sequence and it isn't truncated by buffer
     // limits so utf_ptr2char() is safe to use. Or the first byte is
     // illegal (k=0), and it's also safe to use utf_ptr2char().
-    c = utf_ptr2char((char *)(*s));
+    int c = utf_ptr2char(*s);
 
     // On failure, utf_ptr2char() returns the first byte, so here we
     // check equality with the first byte. The only non-ASCII character
     // which equals the first byte of its own UTF-8 representation is
     // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
     // It's safe even if n=1, else we would have k=2 > n.
-    if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
+    if (c != (int)((uint8_t)(**s)) || (c == 0xC3 && (uint8_t)(*s)[1] == 0x83)) {
       // byte sequence was successfully decoded
       *s += k;
       *n -= k;
@@ -705,9 +692,7 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
 // Note: composing characters are skipped!
 int mb_ptr2char_adv(const char **const pp)
 {
-  int c;
-
-  c = utf_ptr2char(*pp);
+  int c = utf_ptr2char(*pp);
   *pp += utfc_ptr2len(*pp);
   return c;
 }
@@ -716,9 +701,7 @@ int mb_ptr2char_adv(const char **const pp)
 // Note: composing characters are returned as separate characters.
 int mb_cptr2char_adv(const char **pp)
 {
-  int c;
-
-  c = utf_ptr2char(*pp);
+  int c = utf_ptr2char(*pp);
   *pp += utf_ptr2len(*pp);
   return c;
 }
@@ -728,92 +711,78 @@ int mb_cptr2char_adv(const char **pp)
 /// behaves like a composing character.
 bool utf_composinglike(const char *p1, const char *p2)
 {
-  int c2;
-
-  c2 = utf_ptr2char((char *)p2);
+  int c2 = utf_ptr2char(p2);
   if (utf_iscomposing(c2)) {
     return true;
   }
   if (!arabic_maycombine(c2)) {
     return false;
   }
-  return arabic_combine(utf_ptr2char((char *)p1), c2);
+  return arabic_combine(utf_ptr2char(p1), c2);
 }
 
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
+///
+/// Caller is expected to check for things like unprintable chars etc
+/// If first char in string is a composing char, prepend a space to display it correctly.
 ///
-/// Also gets up to #MAX_MCO composing characters.
+/// If "p" starts with an invalid sequence, zero is returned.
 ///
-/// @param[out]  pcc  Location where to store composing characters. Must have
-///                   space at least for #MAX_MCO + 1 elements.
+/// @param[out] firstc (required) The first codepoint of the screen char,
+///                    or the first byte of an invalid sequence
 ///
-/// @return leading character.
-int utfc_ptr2char(const char *p, int *pcc)
+/// @return the char
+schar_T utfc_ptr2schar(const char *p, int *firstc)
+  FUNC_ATTR_NONNULL_ALL
 {
-  int i = 0;
-
   int c = utf_ptr2char(p);
-  int len = utf_ptr2len(p);
+  *firstc = c;  // NOT optional, you are gonna need it
+  bool first_compose = utf_iscomposing(c);
+  size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+  size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
 
-  // Only accept a composing char when the first char isn't illegal.
-  if ((len > 1 || (uint8_t)(*p) < 0x80)
-      && (uint8_t)p[len] >= 0x80
-      && utf_composinglike(p, p + len)) {
-    int cc = utf_ptr2char(p + len);
-    for (;;) {
-      pcc[i++] = cc;
-      if (i == MAX_MCO) {
-        break;
-      }
-      len += utf_ptr2len(p + len);
-      if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
-        break;
-      }
-    }
-  }
-
-  if (i < MAX_MCO) {    // last composing char must be 0
-    pcc[i] = 0;
+  if (len == 1 && (uint8_t)(*p) >= 0x80) {
+    return 0;  // invalid sequence
   }
 
-  return c;
+  return schar_from_buf_first(p, len, first_compose);
 }
 
-// Convert a UTF-8 byte string to a wide character.  Also get up to MAX_MCO
-// composing characters.  Use no more than p[maxlen].
-//
-// @param [out] pcc: composing chars, last one is 0
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+/// Get the screen char at the beginning of a string with length
+///
+/// Like utfc_ptr2schar but use no more than p[maxlen].
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+  FUNC_ATTR_NONNULL_ALL
 {
   assert(maxlen > 0);
 
-  int i = 0;
+  size_t len = (size_t)utf_ptr2len_len(p, maxlen);
+  if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+    // invalid or truncated sequence
+    *firstc = (uint8_t)(*p);
+    return 0;
+  }
 
-  int len = utf_ptr2len_len(p, maxlen);
-  // Is it safe to use utf_ptr2char()?
-  bool safe = len > 1 && len <= maxlen;
-  int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
+  int c = utf_ptr2char(p);
+  *firstc = c;
+  bool first_compose = utf_iscomposing(c);
+  maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
+  len = (size_t)utfc_ptr2len_len(p, maxlen);
 
-  // Only accept a composing char when the first char isn't illegal.
-  if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
-    for (; i < MAX_MCO; i++) {
-      int len_cc = utf_ptr2len_len(p + len, maxlen - len);
-      safe = len_cc > 1 && len_cc <= maxlen - len;
-      if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
-          || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
-        break;
-      }
-      len += len_cc;
-    }
-  }
+  return schar_from_buf_first(p, len, first_compose);
+}
 
-  if (i < MAX_MCO) {
-    // last composing char must be 0
-    pcc[i] = 0;
+/// Caller must ensure there is space for `first_compose`
+static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
+{
+  if (first_compose) {
+    char cbuf[MAX_SCHAR_SIZE];
+    cbuf[0] = ' ';
+    memcpy(cbuf + 1, buf, len);
+    return schar_from_buf(cbuf, len + 1);
+  } else {
+    return schar_from_buf(buf, len);
   }
-
-  return c;
-#undef ISCOMPOSING
 }
 
 /// Get the length of a UTF-8 byte sequence representing a single codepoint
@@ -854,11 +823,9 @@ int utf_byte2len(int b)
 // Never returns zero.
 int utf_ptr2len_len(const char *p, int size)
 {
-  int len;
-  int i;
   int m;
 
-  len = utf8len_tab[(uint8_t)(*p)];
+  int len = utf8len_tab[(uint8_t)(*p)];
   if (len == 1) {
     return 1;           // NUL, ascii or illegal lead byte
   }
@@ -867,7 +834,7 @@ int utf_ptr2len_len(const char *p, int size)
   } else {
     m = len;
   }
-  for (i = 1; i < m; i++) {
+  for (int i = 1; i < m; i++) {
     if ((p[i] & 0xc0) != 0x80) {
       return 1;
     }
@@ -898,10 +865,9 @@ int utfc_ptr2len(const char *const p)
     return 1;
   }
 
-  // Check for composing characters.  We can handle only the first six, but
-  // skip all of them (otherwise the cursor would get stuck).
+  // Check for composing characters.
   int prevlen = 0;
-  for (;;) {
+  while (true) {
     if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
       return len;
     }
@@ -918,9 +884,6 @@ int utfc_ptr2len(const char *const p)
 /// Returns 1 for an illegal char or an incomplete byte sequence.
 int utfc_ptr2len_len(const char *p, int size)
 {
-  int len;
-  int prevlen;
-
   if (size < 1 || *p == NUL) {
     return 0;
   }
@@ -929,7 +892,7 @@ int utfc_ptr2len_len(const char *p, int size)
   }
 
   // Skip over first UTF-8 char, stopping at a NUL byte.
-  len = utf_ptr2len_len(p, size);
+  int len = utf_ptr2len_len(p, size);
 
   // Check for illegal byte and incomplete byte sequence.
   if ((len == 1 && (uint8_t)p[0] >= 0x80) || len > size) {
@@ -938,17 +901,15 @@ int utfc_ptr2len_len(const char *p, int size)
 
   // Check for composing characters.  We can handle only the first six, but
   // skip all of them (otherwise the cursor would get stuck).
-  prevlen = 0;
+  int prevlen = 0;
   while (len < size) {
-    int len_next_char;
-
     if ((uint8_t)p[len] < 0x80) {
       break;
     }
 
     // Next character length should not go beyond size to ensure that
     // utf_composinglike(...) does not read beyond size.
-    len_next_char = utf_ptr2len_len(p + len, size - len);
+    int len_next_char = utf_ptr2len_len(p + len, size - len);
     if (len_next_char > size - len) {
       break;
     }
@@ -1063,9 +1024,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
 {
   // sorted list of non-overlapping intervals
   static struct clinterval {
-    unsigned int first;
-    unsigned int last;
-    unsigned int class;
+    unsigned first;
+    unsigned last;
+    unsigned cls;
   } classes[] = {
     { 0x037e, 0x037e, 1 },              // Greek question mark
     { 0x0387, 0x0387, 1 },              // Greek ano teleia
@@ -1141,7 +1102,6 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
   };
   int bot = 0;
   int top = ARRAY_SIZE(classes) - 1;
-  int mid;
 
   // First quick check for Latin1 characters, use 'iskeyword'.
   if (c < 0x100) {
@@ -1161,13 +1121,13 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
 
   // binary search in table
   while (top >= bot) {
-    mid = (bot + top) / 2;
-    if (classes[mid].last < (unsigned int)c) {
+    int mid = (bot + top) / 2;
+    if (classes[mid].last < (unsigned)c) {
       bot = mid + 1;
-    } else if (classes[mid].first > (unsigned int)c) {
+    } else if (classes[mid].first > (unsigned)c) {
       top = mid - 1;
     } else {
-      return (int)classes[mid].class;
+      return (int)classes[mid].cls;
     }
   }
 
@@ -1186,13 +1146,12 @@ bool utf_ambiguous_width(int c)
 // the given conversion "table".  Uses binary search on "table".
 static int utf_convert(int a, const convertStruct *const table, size_t n_items)
 {
-  size_t start, mid, end;   // indices into table
-
-  start = 0;
-  end = n_items;
+  // indices into table
+  size_t start = 0;
+  size_t end = n_items;
   while (start < end) {
     // need to search further
-    mid = (end + start) / 2;
+    size_t mid = (end + start) / 2;
     if (table[mid].rangeEnd < a) {
       start = mid + 1;
     } else {
@@ -1285,12 +1244,12 @@ bool mb_isalpha(int a)
   return mb_islower(a) || mb_isupper(a);
 }
 
-static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
+static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
 {
-  int c1, c2, cdiff;
+  int c1, c2;
   char buffer[6];
 
-  for (;;) {
+  while (true) {
     c1 = utf_safe_read_char_adv(&s1, &n1);
     c2 = utf_safe_read_char_adv(&s2, &n2);
 
@@ -1302,7 +1261,7 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2
       continue;
     }
 
-    cdiff = utf_fold(c1) - utf_fold(c2);
+    int cdiff = utf_fold(c1) - utf_fold(c2);
     if (cdiff != 0) {
       return cdiff;
     }
@@ -1326,15 +1285,15 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2
   // to fold just one character to determine the result of comparison.
 
   if (c1 != -1 && c2 == -1) {
-    n1 = (size_t)utf_char2bytes(utf_fold(c1), (char *)buffer);
-    s1 = (char_u *)buffer;
+    n1 = (size_t)utf_char2bytes(utf_fold(c1), buffer);
+    s1 = buffer;
   } else if (c2 != -1 && c1 == -1) {
-    n2 = (size_t)utf_char2bytes(utf_fold(c2), (char *)buffer);
-    s2 = (char_u *)buffer;
+    n2 = (size_t)utf_char2bytes(utf_fold(c2), buffer);
+    s2 = buffer;
   }
 
   while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
-    cdiff = (int)(*s1) - (int)(*s2);
+    int cdiff = (int)((uint8_t)(*s1)) - (int)((uint8_t)(*s2));
     if (cdiff != 0) {
       return cdiff;
     }
@@ -1483,11 +1442,11 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_
   FUNC_ATTR_NONNULL_ALL
 {
   size_t count = 0;
-  size_t clen, i;
+  size_t clen;
   if (index == 0) {
     return 0;
   }
-  for (i = 0; i < len; i += clen) {
+  for (size_t i = 0; i < len; i += clen) {
     clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i));
     // NB: gets the byte value of invalid sequence bytes.
     // we only care whether the char fits in the BMP or not
@@ -1512,7 +1471,7 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_
 ///          two characters otherwise.
 int mb_strnicmp(const char *s1, const char *s2, const size_t nn)
 {
-  return utf_strnicmp((char_u *)s1, (char_u *)s2, nn, nn);
+  return utf_strnicmp(s1, s2, nn, nn);
 }
 
 /// Compare strings case-insensitively
@@ -1536,23 +1495,18 @@ int mb_stricmp(const char *s1, const char *s2)
 // 'encoding' has been set to.
 void show_utf8(void)
 {
-  int len;
-  int rlen = 0;
-  char *line;
-  int clen;
-  int i;
-
   // Get the byte length of the char under the cursor, including composing
   // characters.
-  line = get_cursor_pos_ptr();
-  len = utfc_ptr2len(line);
+  char *line = get_cursor_pos_ptr();
+  int len = utfc_ptr2len(line);
   if (len == 0) {
-    msg("NUL");
+    msg("NUL", 0);
     return;
   }
 
-  clen = 0;
-  for (i = 0; i < len; i++) {
+  size_t rlen = 0;
+  int clen = 0;
+  for (int i = 0; i < len; i++) {
     if (clen == 0) {
       // start of (composing) character, get its length
       if (i > 0) {
@@ -1561,16 +1515,17 @@ void show_utf8(void)
       }
       clen = utf_ptr2len(line + i);
     }
-    sprintf(IObuff + rlen, "%02x ",  // NOLINT(runtime/printf)
-            (line[i] == NL) ? NUL : (uint8_t)line[i]);          // NUL is stored as NL
+    assert(IOSIZE > rlen);
+    snprintf(IObuff + rlen, IOSIZE - rlen, "%02x ",
+             (line[i] == NL) ? NUL : (uint8_t)line[i]);  // NUL is stored as NL
     clen--;
-    rlen += (int)strlen(IObuff + rlen);
+    rlen += strlen(IObuff + rlen);
     if (rlen > IOSIZE - 20) {
       break;
     }
   }
 
-  msg(IObuff);
+  msg(IObuff, 0);
 }
 
 /// Return offset from "p" to the start of a character, including composing characters.
@@ -1579,9 +1534,6 @@ void show_utf8(void)
 /// Returns 0 when already at the first byte of a character.
 int utf_head_off(const char *base_in, const char *p_in)
 {
-  int c;
-  int len;
-
   if ((uint8_t)(*p_in) < 0x80) {              // be quick for ASCII
     return 0;
   }
@@ -1603,7 +1555,7 @@ int utf_head_off(const char *base_in, const char *p_in)
     }
     // Check for illegal sequence. Do allow an illegal byte after where we
     // started.
-    len = utf8len_tab[*q];
+    int len = utf8len_tab[*q];
     if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
       return 0;
     }
@@ -1612,7 +1564,7 @@ int utf_head_off(const char *base_in, const char *p_in)
       break;
     }
 
-    c = utf_ptr2char((char *)q);
+    int c = utf_ptr2char((char *)q);
     if (utf_iscomposing(c)) {
       continue;
     }
@@ -1669,7 +1621,7 @@ bool utf_allow_break_before(int cc)
     0x2021,  // ‡ double dagger
     0x2026,  // … horizontal ellipsis
     0x2030,  // ‰ per mille sign
-    0x2031,  // ‱ per then thousand sign
+    0x2031,  // ‱ per ten thousand sign
     0x203c,  // ‼ double exclamation mark
     0x2047,  // ⁇ double question mark
     0x2048,  // ⁈ question exclamation mark
@@ -1795,7 +1747,6 @@ int mb_off_next(const char *base, const char *p_in)
 {
   const uint8_t *p = (uint8_t *)p_in;
   int i;
-  int j;
 
   if (*p < 0x80) {              // be quick for ASCII
     return 0;
@@ -1804,6 +1755,7 @@ int mb_off_next(const char *base, const char *p_in)
   // Find the next character that isn't 10xx.xxxx
   for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
   if (i > 0) {
+    int j;
     // Check for illegal sequence.
     for (j = 0; p - j > (uint8_t *)base; j++) {
       if ((p[-j] & 0xc0) != 0x80) {
@@ -1849,33 +1801,35 @@ int utf_cp_tail_off(const char *base, const char *p_in)
 /// Return the offset from "p" to the first byte of the codepoint it points
 /// to. Can start anywhere in a stream of bytes.
 /// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
 ///
 /// @param[in] base  Pointer to start of string
 /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
 //
-/// @return 0 if invalid sequence, else offset to previous codepoint
-int utf_cp_head_off(const char_u *base, const char_u *p)
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
+int utf_cp_head_off(const char *base, const char *p)
 {
   int i;
-  int j;
 
   if (*p == NUL) {
     return 0;
   }
 
   // Find the first character that is not 10xx.xxxx
-  for (i = 0; p - i > base; i--) {
-    if ((p[i] & 0xc0) != 0x80) {
+  for (i = 0; p - i >= base; i++) {
+    if (((uint8_t)p[-i] & 0xc0) != 0x80) {
       break;
     }
   }
 
-  // Find the last character that is 10xx.xxxx
-  for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {}
+  // Find the last character that is 10xx.xxxx (condition terminates on NUL)
+  int j = 1;
+  while (((uint8_t)p[j] & 0xc0) == 0x80) {
+    j++;
+  }
 
   // Check for illegal sequence.
-  if (utf8len_tab[p[i]] == 1) {
+  if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
     return 0;
   }
   return i;
@@ -1885,8 +1839,6 @@ int utf_cp_head_off(const char_u *base, const char_u *p)
 void utf_find_illegal(void)
 {
   pos_T pos = curwin->w_cursor;
-  char *p;
-  int len;
   vimconv_T vimconv;
   char *tofree = NULL;
 
@@ -1899,8 +1851,8 @@ void utf_find_illegal(void)
   }
 
   curwin->w_cursor.coladd = 0;
-  for (;;) {
-    p = get_cursor_pos_ptr();
+  while (true) {
+    char *p = get_cursor_pos_ptr();
     if (vimconv.vc_type != CONV_NONE) {
       xfree(tofree);
       tofree = string_convert(&vimconv, p, NULL);
@@ -1913,7 +1865,7 @@ void utf_find_illegal(void)
     while (*p != NUL) {
       // Illegal means that there are not enough trail bytes (checked by
       // utf_ptr2len()) or too many of them (overlong sequence).
-      len = utf_ptr2len(p);
+      int len = utf_ptr2len(p);
       if ((uint8_t)(*p) >= 0x80 && (len == 1 || utf_char2len(utf_ptr2char(p)) != len)) {
         if (vimconv.vc_type == CONV_NONE) {
           curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
@@ -1948,16 +1900,16 @@ theend:
 
 /// @return  true if string "s" is a valid utf-8 string.
 /// When "end" is NULL stop at the first NUL.  Otherwise stop at "end".
-bool utf_valid_string(const char_u *s, const char_u *end)
+bool utf_valid_string(const char *s, const char *end)
 {
-  const char_u *p = s;
+  const uint8_t *p = (uint8_t *)s;
 
-  while (end == NULL ? *p != NUL : p < end) {
+  while (end == NULL ? *p != NUL : p < (uint8_t *)end) {
     int l = utf8len_tab_zero[*p];
     if (l == 0) {
       return false;  // invalid lead byte
     }
-    if (end != NULL && p + l > end) {
+    if (end != NULL && p + l > (uint8_t *)end) {
       return false;  // incomplete byte sequence
     }
     p++;
@@ -1988,7 +1940,7 @@ void mb_check_adjust_col(void *win_)
 
   // Column 0 is always valid.
   if (oldcol != 0) {
-    char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
+    char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum);
     colnr_T len = (colnr_T)strlen(p);
 
     // Empty line or invalid column?
@@ -2042,6 +1994,24 @@ int mb_charlen(const char *str)
   return count;
 }
 
+int mb_charlen2bytelen(const char *str, int charlen)
+{
+  const char *p = str;
+  int count = 0;  // total bytes of the characters walked so far
+
+  if (p == NULL) {  // a NULL string contributes no bytes
+    return 0;
+  }
+
+  for (int i = 0; *p != NUL && i < charlen; i++) {  // at most "charlen" chars, stop at NUL
+    int b = utfc_ptr2len(p);  // byte length of this char, composing chars included
+    p += b;
+    count += b;
+  }
+
+  return count;
+}
+
 /// Like mb_charlen() but for a string with specified length.
 int mb_charlen_len(const char *str, int len)
 {
@@ -2122,7 +2092,6 @@ char *enc_skip(char *p)
 char *enc_canonize(char *enc)
   FUNC_ATTR_NONNULL_RET
 {
-  char *p, *s;
   if (strcmp(enc, "default") == 0) {
     // Use the default encoding as found by set_init_1().
     return xstrdup(fenc_default);
@@ -2131,8 +2100,8 @@ char *enc_canonize(char *enc)
   // copy "enc" to allocated memory, with room for two '-'
   char *r = xmalloc(strlen(enc) + 3);
   // Make it all lower case and replace '_' with '-'.
-  p = r;
-  for (s = enc; *s != NUL; s++) {
+  char *p = r;
+  for (char *s = enc; *s != NUL; s++) {
     if (*s == '_') {
       *p++ = '-';
     } else {
@@ -2184,9 +2153,7 @@ char *enc_canonize(char *enc)
 /// Returns -1 when not found.
 static int enc_alias_search(const char *name)
 {
-  int i;
-
-  for (i = 0; enc_alias_table[i].name != NULL; i++) {
+  for (int i = 0; enc_alias_table[i].name != NULL; i++) {
     if (strcmp(name, enc_alias_table[i].name) == 0) {
       return enc_alias_table[i].canon;
     }
@@ -2210,10 +2177,7 @@ char *enc_locale(void)
   if (!(s = nl_langinfo(CODESET)) || *s == NUL)
 #endif
   {
-#if defined(HAVE_LOCALE_H)
-    if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL)
-#endif
-    {
+    if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) {
       if ((s = os_getenv("LC_ALL"))) {
         if ((s = os_getenv("LC_CTYPE"))) {
           s = os_getenv("LANG");
@@ -2269,17 +2233,14 @@ enc_locale_copy_enc:
 // (should return iconv_t, but that causes problems with prototypes).
 void *my_iconv_open(char *to, char *from)
 {
-  iconv_t fd;
 #define ICONV_TESTLEN 400
   char tobuf[ICONV_TESTLEN];
-  char *p;
-  size_t tolen;
   static WorkingStatus iconv_working = kUnknown;
 
   if (iconv_working == kBroken) {
     return (void *)-1;          // detected a broken iconv() previously
   }
-  fd = iconv_open(enc_skip(to), enc_skip(from));
+  iconv_t fd = iconv_open(enc_skip(to), enc_skip(from));
 
   if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
     // Do a dummy iconv() call to check if it actually works.  There is a
@@ -2287,8 +2248,8 @@ void *my_iconv_open(char *to, char *from)
     // because it's wide-spread.  The symptoms are that after outputting
     // the initial shift state the "to" pointer is NULL and conversion
     // stops for no apparent reason after about 8160 characters.
-    p = tobuf;
-    tolen = ICONV_TESTLEN;
+    char *p = tobuf;
+    size_t tolen = ICONV_TESTLEN;
     (void)iconv(fd, NULL, NULL, &p, &tolen);
     if (p == NULL) {
       iconv_working = kBroken;
@@ -2310,24 +2271,19 @@ void *my_iconv_open(char *to, char *from)
 static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t slen,
                           size_t *unconvlenp, size_t *resultlenp)
 {
-  const char *from;
-  size_t fromlen;
   char *to;
-  size_t tolen;
   size_t len = 0;
   size_t done = 0;
   char *result = NULL;
-  char *p;
-  int l;
 
-  from = str;
-  fromlen = slen;
-  for (;;) {
+  const char *from = str;
+  size_t fromlen = slen;
+  while (true) {
     if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) {
       // Allocate enough room for most conversions.  When re-allocating
       // increase the buffer size.
       len = len + fromlen * 2 + 40;
-      p = xmalloc(len);
+      char *p = xmalloc(len);
       if (done > 0) {
         memmove(p, result, done);
       }
@@ -2336,7 +2292,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
     }
 
     to = result + done;
-    tolen = len - done - 2;
+    size_t tolen = len - done - 2;
     // Avoid a warning for systems with a wrong iconv() prototype by
     // casting the second argument to void *.
     if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
@@ -2366,7 +2322,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
       if (utf_ptr2cells(from) > 1) {
         *to++ = '?';
       }
-      l = utfc_ptr2len_len(from, (int)fromlen);
+      int l = utfc_ptr2len_len(from, (int)fromlen);
       from += l;
       fromlen -= (size_t)l;
     } else if (ICONV_ERRNO != ICONV_E2BIG) {
@@ -2384,6 +2340,34 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
   return result;
 }
 
+/// iconv() function: convert argvars[0] from encoding argvars[1] to encoding argvars[2]
+void f_iconv(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
+{
+  vimconv_T vimconv;
+
+  rettv->v_type = VAR_STRING;
+  rettv->vval.v_string = NULL;  // result string is NULL until a value is assigned below
+
+  const char *const str = tv_get_string(&argvars[0]);
+  char buf1[NUMBUFLEN];
+  char *const from = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[1], buf1)));
+  char buf2[NUMBUFLEN];
+  char *const to = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[2], buf2)));
+  vimconv.vc_type = CONV_NONE;  // convert_setup() requires vc_type to start as CONV_NONE
+  convert_setup(&vimconv, from, to);
+
+  // If the encodings are equal, no conversion is needed: return a plain copy of the input.
+  if (vimconv.vc_type == CONV_NONE) {
+    rettv->vval.v_string = xstrdup(str);
+  } else {
+    rettv->vval.v_string = string_convert(&vimconv, (char *)str, NULL);
+  }
+
+  convert_setup(&vimconv, NULL, NULL);  // NOTE(review): presumably resets "vimconv"; confirm
+  xfree(from);  // "from" and "to" are allocated copies returned by enc_canonize()
+  xfree(to);
+}
+
 /// Setup "vcp" for conversion from "from" to "to".
 /// The names must have been made canonical with enc_canonize().
 /// vcp->vc_type must have been initialized to CONV_NONE.
@@ -2402,8 +2386,6 @@ int convert_setup(vimconv_T *vcp, char *from, char *to)
 int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, char *to,
                       bool to_unicode_is_utf8)
 {
-  int from_prop;
-  int to_prop;
   int from_is_utf8;
   int to_is_utf8;
 
@@ -2419,8 +2401,8 @@ int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, cha
     return OK;
   }
 
-  from_prop = enc_canon_props(from);
-  to_prop = enc_canon_props(to);
+  int from_prop = enc_canon_props(from);
+  int to_prop = enc_canon_props(to);
   if (from_unicode_is_utf8) {
     from_is_utf8 = from_prop & ENC_UNICODE;
   } else {
@@ -2477,9 +2459,8 @@ char *string_convert(const vimconv_T *const vcp, char *ptr, size_t *lenp)
 // set to the number of remaining bytes.
 char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, size_t *unconvlenp)
 {
-  char_u *retval = NULL;
-  char_u *d;
-  int l;
+  uint8_t *retval = NULL;
+  uint8_t *d;
   int c;
 
   size_t len;
@@ -2499,10 +2480,10 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
     for (size_t i = 0; i < len; i++) {
       c = (uint8_t)ptr[i];
       if (c < 0x80) {
-        *d++ = (char_u)c;
+        *d++ = (uint8_t)c;
       } else {
-        *d++ = (char_u)(0xc0 + (char_u)((unsigned)c >> 6));
-        *d++ = (char_u)(0x80 + (c & 0x3f));
+        *d++ = (uint8_t)(0xc0 + (uint8_t)((unsigned)c >> 6));
+        *d++ = (uint8_t)(0x80 + (c & 0x3f));
       }
     }
     *d = NUL;
@@ -2547,7 +2528,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
     retval = xmalloc(len + 1);
     d = retval;
     for (size_t i = 0; i < len; i++) {
-      l = utf_ptr2len_len(ptr + i, (int)(len - i));
+      int l = utf_ptr2len_len(ptr + i, (int)(len - i));
       if (l == 0) {
         *d++ = NUL;
       } else if (l == 1) {
@@ -2597,7 +2578,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
         }
         if (!utf_iscomposing(c)) {              // skip composing chars
           if (c < 0x100) {
-            *d++ = (char_u)c;
+            *d++ = (uint8_t)c;
           } else if (vcp->vc_fail) {
             xfree(retval);
             return NULL;
@@ -2618,7 +2599,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
     break;
 
   case CONV_ICONV:  // conversion with vcp->vc_fd
-    retval = (char_u *)iconv_string(vcp, ptr, len, unconvlenp, lenp);
+    retval = (uint8_t *)iconv_string(vcp, ptr, len, unconvlenp, lenp);
     break;
   }
 
@@ -2627,8 +2608,8 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
 
 /// Table set by setcellwidths().
 typedef struct {
-  long first;
-  long last;
+  int64_t first;
+  int64_t last;
   char width;
 } cw_interval_T;
 
@@ -2753,7 +2734,7 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
     const listitem_T *lili = tv_list_first(li_l);
     const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number;
     if (item > 0 && n1 <= table[item - 1].last) {
-      semsg(_(e_overlapping_ranges_for_nr), (long)n1);
+      semsg(_(e_overlapping_ranges_for_nr), (size_t)n1);
       xfree((void *)ptrs);
       xfree(table);
       return;
@@ -2810,3 +2791,14 @@ void f_charclass(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
   }
   rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string);
 }
+
+/// Function given to ExpandGeneric() to obtain the possible arguments of the
+/// encoding options: enc_canon_table[idx].name, or NULL once idx is past the table end.
+char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx)
+{
+  if (idx >= (int)ARRAY_SIZE(enc_canon_table)) {  // only the upper bound is checked here
+    return NULL;
+  }
+
+  return (char *)enc_canon_table[idx].name;
+}