1 files changed, 125 insertions, 129 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 5eb209a6f6..a9792cf1b9 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -334,10 +334,9 @@ enc_alias_table[] =
  * Returns -1 if not found.
  */
 static int enc_canon_search(const char_u *name)
+  FUNC_ATTR_PURE
 {
-  int i;
-
-  for (i = 0; i < IDX_COUNT; ++i) {
+  for (int i = 0; i < IDX_COUNT; i++) {
     if (STRCMP(name, enc_canon_table[i].name) == 0) {
       return i;
     }
@@ -345,16 +344,14 @@ static int enc_canon_search(const char_u *name)
   return -1;
 }
 
-
 /*
  * Find canonical encoding "name" in the list and return its properties.
  * Returns 0 if not found.
  */
 int enc_canon_props(const char_u *name)
+  FUNC_ATTR_PURE
 {
-  int i;
-
-  i = enc_canon_search(name);
+  int i = enc_canon_search(name);
   if (i >= 0) {
     return enc_canon_table[i].prop;
   } else if (STRNCMP(name, "2byte-", 6) == 0) {
@@ -373,6 +370,7 @@ int enc_canon_props(const char_u *name)
  * 3 - UTF-8 BOM
  */
 int bomb_size(void)
+  FUNC_ATTR_PURE
 {
   int n = 0;
 
@@ -414,11 +412,13 @@ void remove_bom(char_u *s)
  * >2 for other word characters
  */
 int mb_get_class(const char_u *p)
+  FUNC_ATTR_PURE
 {
   return mb_get_class_tab(p, curbuf->b_chartab);
 }
 
 int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
+  FUNC_ATTR_PURE
 {
   if (MB_BYTE2LEN(p[0]) == 1) {
     if (p[0] == NUL || ascii_iswhite(p[0])) {
@@ -429,13 +429,14 @@ int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
     }
     return 1;
   }
-  return utf_class_tab(utf_ptr2char(p), chartab);
+  return utf_class_tab(utf_ptr2char((char *)p), chartab);
 }
 
 /*
  * Return true if "c" is in "table".
  */
 static bool intable(const struct interval *table, size_t n_items, int c)
+  FUNC_ATTR_PURE
 {
   int mid, bot, top;
 
@@ -471,27 +472,12 @@ static bool intable(const struct interval *table, size_t n_items, int c)
 int utf_char2cells(int c)
 {
   if (c >= 0x100) {
-#ifdef USE_WCHAR_FUNCTIONS
-    //
-    // Assume the library function wcwidth() works better than our own
-    // stuff.  It should return 1 for ambiguous width chars!
-    //
-    int n = wcwidth(c);
-
-    if (n < 0) {
-      return 6;                 // unprintable, displays <xxxx>
-    }
-    if (n > 1) {
-      return n;
-    }
-#else
     if (!utf_printable(c)) {
       return 6;                 // unprintable, displays <xxxx>
     }
     if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
       return 2;
     }
-#endif
     if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
       return 2;
     }
@@ -510,12 +496,12 @@ int utf_char2cells(int c)
 
 /// Return the number of display cells character at "*p" occupies.
 /// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char_u *p)
+int utf_ptr2cells(const char *p)
 {
   int c;
 
   // Need to convert to a character number.
-  if (*p >= 0x80) {
+  if ((uint8_t)(*p) >= 0x80) {
     c = utf_ptr2char(p);
     // An illegal byte is displayed as <xx>.
     if (utf_ptr2len(p) == 1 || c == NUL) {
@@ -541,9 +527,9 @@ int utf_ptr2cells_len(const char_u *p, int size)
     if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) {
       return 1;        // truncated
     }
-    c = utf_ptr2char(p);
+    c = utf_ptr2char((char *)p);
     // An illegal byte is displayed as <xx>.
-    if (utf_ptr2len(p) == 1 || c == NUL) {
+    if (utf_ptr2len((char *)p) == 1 || c == NUL) {
       return 4;
     }
     // If the char is ASCII it must be an overlong sequence.
@@ -560,12 +546,12 @@ int utf_ptr2cells_len(const char_u *p, int size)
 /// @param str The source string, may not be NULL, must be a NUL-terminated
 ///            string.
 /// @return The number of cells occupied by string `str`
-size_t mb_string2cells(const char_u *str)
+size_t mb_string2cells(const char *str)
 {
   size_t clen = 0;
 
-  for (const char_u *p = str; *p != NUL; p += utfc_ptr2len(p)) {
-    clen += utf_ptr2cells(p);
+  for (const char_u *p = (char_u *)str; *p != NUL; p += utfc_ptr2len((char *)p)) {
+    clen += utf_ptr2cells((char *)p);
   }
 
   return clen;
@@ -577,14 +563,14 @@ size_t mb_string2cells(const char_u *str)
 ///            string.
 /// @param size maximum length of string. It will terminate on earlier NUL.
 /// @return The number of cells occupied by string `str`
-size_t mb_string2cells_len(const char_u *str, size_t size)
+size_t mb_string2cells_len(const char *str, size_t size)
   FUNC_ATTR_NONNULL_ARG(1)
 {
   size_t clen = 0;
 
-  for (const char_u *p = str; *p != NUL && p < str+size;
-       p += utfc_ptr2len_len(p, size+(p-str))) {
-    clen += utf_ptr2cells(p);
+  for (const char_u *p = (char_u *)str; *p != NUL && p < (char_u *)str + size;
+       p += utfc_ptr2len_len(p, size + (p - (char_u *)str))) {
+    clen += utf_ptr2cells((char *)p);
   }
 
   return clen;
@@ -600,9 +586,10 @@ size_t mb_string2cells_len(const char_u *str, size_t size)
 /// @param[in]  p  String to convert.
 ///
 /// @return Unicode codepoint or byte value.
-int utf_ptr2char(const char_u *const p)
+int utf_ptr2char(const char *const p_in)
   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
 {
+  uint8_t *p = (uint8_t *)p_in;
   if (p[0] < 0x80) {  // Be quick for ASCII.
     return p[0];
   }
@@ -677,7 +664,7 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
     // We have a multibyte sequence and it isn't truncated by buffer
     // limits so utf_ptr2char() is safe to use. Or the first byte is
     // illegal (k=0), and it's also safe to use utf_ptr2char().
-    c = utf_ptr2char(*s);
+    c = utf_ptr2char((char *)(*s));
 
     // On failure, utf_ptr2char() returns the first byte, so here we
     // check equality with the first byte. The only non-ASCII character
@@ -704,8 +691,8 @@ int mb_ptr2char_adv(const char_u **const pp)
 {
   int c;
 
-  c = utf_ptr2char(*pp);
-  *pp += utfc_ptr2len(*pp);
+  c = utf_ptr2char((char *)(*pp));
+  *pp += utfc_ptr2len((char *)(*pp));
   return c;
 }
 
@@ -717,8 +704,8 @@ int mb_cptr2char_adv(const char_u **pp)
 {
   int c;
 
-  c = utf_ptr2char(*pp);
-  *pp += utf_ptr2len(*pp);
+  c = utf_ptr2char((char *)(*pp));
+  *pp += utf_ptr2len((char *)(*pp));
   return c;
 }
 
@@ -731,14 +718,14 @@ bool utf_composinglike(const char_u *p1, const char_u *p2)
 {
   int c2;
 
-  c2 = utf_ptr2char(p2);
+  c2 = utf_ptr2char((char *)p2);
   if (utf_iscomposing(c2)) {
     return true;
   }
   if (!arabic_maycombine(c2)) {
     return false;
   }
-  return arabic_combine(utf_ptr2char(p1), c2);
+  return arabic_combine(utf_ptr2char((char *)p1), c2);
 }
 
 /// Convert a UTF-8 string to a wide character
@@ -756,21 +743,21 @@ int utfc_ptr2char(const char_u *p, int *pcc)
   int cc;
   int i = 0;
 
-  c = utf_ptr2char(p);
-  len = utf_ptr2len(p);
+  c = utf_ptr2char((char *)p);
+  len = utf_ptr2len((char *)p);
 
   // Only accept a composing char when the first char isn't illegal.
   if ((len > 1 || *p < 0x80)
       && p[len] >= 0x80
       && utf_composinglike(p, p + len)) {
-    cc = utf_ptr2char(p + len);
+    cc = utf_ptr2char((char *)p + len);
     for (;;) {
       pcc[i++] = cc;
       if (i == MAX_MCO) {
         break;
       }
-      len += utf_ptr2len(p + len);
-      if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
+      len += utf_ptr2len((char *)p + len);
+      if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char((char *)p + len))) {
         break;
       }
     }
@@ -798,15 +785,15 @@ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
   int len = utf_ptr2len_len(p, maxlen);
   // Is it safe to use utf_ptr2char()?
   bool safe = len > 1 && len <= maxlen;
-  int c = safe ? utf_ptr2char(p) : *p;
+  int c = safe ? utf_ptr2char((char *)p) : *p;
 
   // Only accept a composing char when the first char isn't illegal.
   if ((safe || c < 0x80) && len < maxlen && p[len] >= 0x80) {
     for (; i < MAX_MCO; i++) {
       int len_cc = utf_ptr2len_len(p + len, maxlen - len);
       safe = len_cc > 1 && len_cc <= maxlen - len;
-      if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
-          || !(i == 0 ? utf_composinglike(p, p+len) : utf_iscomposing(pcc[i]))) {
+      if (!safe || (pcc[i] = utf_ptr2char((char *)p + len)) < 0x80
+          || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
         break;
       }
       len += len_cc;
@@ -828,9 +815,10 @@ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
 ///
 /// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
 ///         sequence.
-int utf_ptr2len(const char_u *const p)
+int utf_ptr2len(const char *const p_in)
   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
 {
+  uint8_t *p = (uint8_t *)p_in;
   if (*p == NUL) {
     return 0;
   }
@@ -887,10 +875,11 @@ int utf_ptr2len_len(const char_u *p, int size)
 /// Return the number of bytes occupied by a UTF-8 character in a string
 ///
 /// This includes following composing characters.
-int utfc_ptr2len(const char_u *const p)
+int utfc_ptr2len(const char *const p_in)
   FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
 {
-  uint8_t b0 = (uint8_t)(*p);
+  uint8_t *p = (uint8_t *)p_in;
+  uint8_t b0 = *p;
 
   if (b0 == NUL) {
     return 0;
@@ -900,7 +889,7 @@ int utfc_ptr2len(const char_u *const p)
   }
 
   // Skip over first UTF-8 char, stopping at a NUL byte.
-  int len = utf_ptr2len(p);
+  int len = utf_ptr2len((char *)p);
 
   // Check for illegal byte.
   if (len == 1 && b0 >= 0x80) {
@@ -917,7 +906,7 @@ int utfc_ptr2len(const char_u *const p)
 
     // Skip over composing char.
     prevlen = len;
-    len += utf_ptr2len(p + len);
+    len += utf_ptr2len((char *)p + len);
   }
 }
 
@@ -1002,7 +991,7 @@ int utf_char2len(const int c)
 /// @param c character to convert to \p buf
 /// @param[out] buf UTF-8 string generated from \p c, does not add \0
 /// @return Number of bytes (1-6).
-int utf_char2bytes(const int c, char_u *const buf)
+int utf_char2bytes(const int c, char *const buf)
 {
   if (c < 0x80) {  // 7 bits
     buf[0] = c;
@@ -1056,23 +1045,16 @@ bool utf_iscomposing(int c)
  */
 bool utf_printable(int c)
 {
-#ifdef USE_WCHAR_FUNCTIONS
-  /*
-   * Assume the iswprint() library function works better than our own stuff.
-   */
-  return iswprint(c);
-#else
   // Sorted list of non-overlapping intervals.
   // 0xd800-0xdfff is reserved for UTF-16, actually illegal.
   static struct interval nonprint[] =
   {
     { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e },
-    { 0x206a, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb },
+    { 0x2060, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb },
     { 0xfffe, 0xffff }
   };
 
   return !intable(nonprint, ARRAY_SIZE(nonprint), c);
-#endif
 }
 
 /*
@@ -1087,6 +1069,7 @@ int utf_class(const int c)
 }
 
 int utf_class_tab(const int c, const uint64_t *const chartab)
+  FUNC_ATTR_PURE
 {
   // sorted list of non-overlapping intervals
   static struct clinterval {
@@ -1317,10 +1300,16 @@ bool mb_isupper(int a)
   return mb_tolower(a) != a;
 }
 
+bool mb_isalpha(int a)
+  FUNC_ATTR_WARN_UNUSED_RESULT
+{
+  return mb_islower(a) || mb_isupper(a);  // true when "a" is a lowercase or an uppercase character
+}
+
 static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
 {
   int c1, c2, cdiff;
-  char_u buffer[6];
+  char buffer[6];
 
   for (;;) {
     c1 = utf_safe_read_char_adv(&s1, &n1);
@@ -1358,11 +1347,11 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2
   // to fold just one character to determine the result of comparison.
 
   if (c1 != -1 && c2 == -1) {
-    n1 = utf_char2bytes(utf_fold(c1), buffer);
-    s1 = buffer;
+    n1 = utf_char2bytes(utf_fold(c1), (char *)buffer);
+    s1 = (char_u *)buffer;
   } else if (c2 != -1 && c1 == -1) {
-    n2 = utf_char2bytes(utf_fold(c2), buffer);
-    s2 = buffer;
+    n2 = utf_char2bytes(utf_fold(c2), (char *)buffer);
+    s2 = (char_u *)buffer;
   }
 
   while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
@@ -1498,10 +1487,10 @@ void mb_utflen(const char_u *s, size_t len, size_t *codepoints, size_t *codeunit
   size_t count = 0, extra = 0;
   size_t clen;
   for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
-    clen = utf_ptr2len_len(s+i, len-i);
+    clen = utf_ptr2len_len(s + i, len - i);
     // NB: gets the byte value of invalid sequence bytes.
     // we only care whether the char fits in the BMP or not
-    int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+    int c = (clen > 1) ? utf_ptr2char((char *)s + i) : s[i];
     count++;
     if (c > 0xFFFF) {
       extra++;
@@ -1520,22 +1509,21 @@ ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, size_t index, bool us
     return 0;
   }
   for (i = 0; i < len && s[i] != NUL; i += clen) {
-    clen = utf_ptr2len_len(s+i, len-i);
+    clen = utf_ptr2len_len(s + i, len - i);
     // NB: gets the byte value of invalid sequence bytes.
     // we only care whether the char fits in the BMP or not
-    int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+    int c = (clen > 1) ? utf_ptr2char((char *)s + i) : s[i];
     count++;
     if (use_utf16_units && c > 0xFFFF) {
       count++;
     }
     if (count >= index) {
-      return i+clen;
+      return i + clen;
     }
   }
   return -1;
 }
 
-
 /*
  * Version of strnicmp() that handles multi-byte characters.
  * Needed for Big5, Shift-JIS and UTF-8 encoding.  Other DBCS encodings can
@@ -1581,7 +1569,7 @@ void show_utf8(void)
   // Get the byte length of the char under the cursor, including composing
   // characters.
   line = get_cursor_pos_ptr();
-  len = utfc_ptr2len(line);
+  len = utfc_ptr2len((char *)line);
   if (len == 0) {
     msg("NUL");
     return;
@@ -1595,7 +1583,7 @@ void show_utf8(void)
         STRCPY(IObuff + rlen, "+ ");
         rlen += 2;
       }
-      clen = utf_ptr2len(line + i);
+      clen = utf_ptr2len((char *)line + i);
     }
     sprintf((char *)IObuff + rlen, "%02x ",
             (line[i] == NL) ? NUL : line[i]);          // NUL is stored as NL
@@ -1645,7 +1633,7 @@ int utf_head_off(const char_u *base, const char_u *p)
       break;
     }
 
-    c = utf_ptr2char(q);
+    c = utf_ptr2char((char *)q);
     if (utf_iscomposing(c)) {
       continue;
     }
@@ -1658,7 +1646,7 @@ int utf_head_off(const char_u *base, const char_u *p)
       while (j > base && (*j & 0xc0) == 0x80) {
         --j;
       }
-      if (arabic_combine(utf_ptr2char(j), c)) {
+      if (arabic_combine(utf_ptr2char((char *)j), c)) {
         continue;
       }
     }
@@ -1814,19 +1802,17 @@ bool utf_allow_break(int cc, int ncc)
 /// @param[in,out]  tp  Destination to copy to.
 void mb_copy_char(const char_u **const fp, char_u **const tp)
 {
-  const size_t l = (size_t)utfc_ptr2len(*fp);
+  const size_t l = (size_t)utfc_ptr2len((char *)(*fp));
 
   memmove(*tp, *fp, l);
   *tp += l;
   *fp += l;
 }
 
-/*
- * Return the offset from "p" to the first byte of a character.  When "p" is
- * at the start of a character 0 is returned, otherwise the offset to the next
- * character.  Can start anywhere in a stream of bytes.
- */
-int mb_off_next(char_u *base, char_u *p)
+/// Return the offset from "p" to the first byte of a character.  When "p" is
+/// at the start of a character 0 is returned, otherwise the offset to the next
+/// character.  Can start anywhere in a stream of bytes.
+int mb_off_next(const char_u *base, const char_u *p)
 {
   int i;
   int j;
@@ -1854,8 +1840,9 @@ int mb_off_next(char_u *base, char_u *p)
 /// Return the offset from "p" to the last byte of the character it points
 /// into.  Can start anywhere in a stream of bytes.
 /// Composing characters are not included.
-int mb_tail_off(char_u *base, char_u *p)
+int mb_tail_off(const char *base, const char *p_in)
 {
+  const uint8_t *p = (uint8_t *)p_in;
   int i;
   int j;
 
@@ -1867,7 +1854,7 @@ int mb_tail_off(char_u *base, char_u *p)
   for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
 
   // Check for illegal sequence.
-  for (j = 0; p - j > base; j++) {
+  for (j = 0; p_in - j > base; j++) {
     if ((p[-j] & 0xc0) != 0x80) {
       break;
     }
@@ -1879,15 +1866,15 @@ int mb_tail_off(char_u *base, char_u *p)
   return i;
 }
 
-
 /// Return the offset from "p" to the first byte of the character it points
 /// into. Can start anywhere in a stream of bytes.
+/// Unlike utf_head_off() this doesn't include composing characters and returns a negative value.
 ///
 /// @param[in] base  Pointer to start of string
 /// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
 //
 /// @return 0 if invalid sequence, else offset to previous codepoint
-int mb_head_off(char_u *base, char_u *p)
+int mb_head_off(const char_u *base, const char_u *p)
 {
   int i;
   int j;
@@ -1947,9 +1934,9 @@ void utf_find_illegal(void)
     while (*p != NUL) {
       // Illegal means that there are not enough trail bytes (checked by
       // utf_ptr2len()) or too many of them (overlong sequence).
-      len = utf_ptr2len(p);
+      len = utf_ptr2len((char *)p);
       if (*p >= 0x80 && (len == 1
-                         || utf_char2len(utf_ptr2char(p)) != len)) {
+                         || utf_char2len(utf_ptr2char((char *)p)) != len)) {
         if (vimconv.vc_type == CONV_NONE) {
           curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
         } else {
@@ -1957,7 +1944,7 @@ void utf_find_illegal(void)
 
           len = (int)(p - tofree);
           for (p = get_cursor_pos_ptr(); *p != NUL && len-- > 0; p += l) {
-            l = utf_ptr2len(p);
+            l = utf_ptr2len((char *)p);
             curwin->w_cursor.col += l;
           }
         }
@@ -1981,6 +1968,31 @@ theend:
   convert_setup(&vimconv, NULL, NULL);
 }
 
+/// @return  true if string "s" is a valid utf-8 string.
+/// When "end" is NULL stop at the first NUL.
+/// When "end" is not NULL stop there; only bytes before "end" are read.
+bool utf_valid_string(const char_u *s, const char_u *end)
+{
+  const char_u *p = s;
+
+  while (end == NULL ? *p != NUL : p < end) {
+    int l = utf8len_tab_zero[*p];  // sequence length for this lead byte, 0 if invalid
+    if (l == 0) {
+      return false;  // invalid lead byte
+    }
+    if (end != NULL && p + l > end) {
+      return false;  // incomplete byte sequence
+    }
+    p++;  // step over the lead byte
+    while (--l > 0) {  // the remaining l - 1 bytes must be 10xxxxxx trail bytes
+      if ((*p++ & 0xc0) != 0x80) {
+        return false;  // invalid trail byte
+      }
+    }
+  }
+  return true;
+}
+
 /*
  * If the cursor moves on an trail byte, set the cursor on the lead byte.
  * Thus it moves left if necessary.
@@ -2001,7 +2013,7 @@ void mb_check_adjust_col(void *win_)
 
   // Column 0 is always valid.
   if (oldcol != 0) {
-    char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
+    char *p = (char *)ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
     colnr_T len = (colnr_T)STRLEN(p);
 
     // Empty line or invalid column?
@@ -2013,7 +2025,7 @@ void mb_check_adjust_col(void *win_)
         win->w_cursor.col = len - 1;
       }
       // Move the cursor to the head byte.
-      win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
+      win->w_cursor.col -= utf_head_off((char_u *)p, (char_u *)p + win->w_cursor.col);
     }
 
     // Reset `coladd` when the cursor would be on the right half of a
@@ -2037,13 +2049,11 @@ char_u *mb_prevptr(char_u *line, char_u *p)
   return p;
 }
 
-/*
- * Return the character length of "str".  Each multi-byte character (with
- * following composing characters) counts as one.
- */
-int mb_charlen(char_u *str)
+/// Return the character length of "str".  Each multi-byte character (with
+/// following composing characters) counts as one.
+int mb_charlen(const char_u *str)
 {
-  char_u *p = str;
+  const char_u *p = str;
   int count;
 
   if (p == NULL) {
@@ -2051,22 +2061,20 @@ int mb_charlen(char_u *str)
   }
 
   for (count = 0; *p != NUL; count++) {
-    p += utfc_ptr2len(p);
+    p += utfc_ptr2len((char *)p);
   }
 
   return count;
 }
 
-/*
- * Like mb_charlen() but for a string with specified length.
- */
-int mb_charlen_len(char_u *str, int len)
+/// Like mb_charlen() but for a string with specified length.
+int mb_charlen_len(const char_u *str, int len)
 {
-  char_u *p = str;
+  const char_u *p = str;
   int count;
 
   for (count = 0; *p != NUL && p < str + len; count++) {
-    p += utfc_ptr2len(p);
+    p += utfc_ptr2len((char *)p);
   }
 
   return count;
@@ -2089,8 +2097,7 @@ const char *mb_unescape(const char **const pp)
   size_t buf_idx = 0;
   uint8_t *str = (uint8_t *)(*pp);
 
-  // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
-  // KS_EXTRA KE_CSI to CSI.
+  // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL.
   // Maximum length of a utf-8 character is 4 bytes.
   for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) {
     if (str[str_idx] == K_SPECIAL
@@ -2098,11 +2105,6 @@ const char *mb_unescape(const char **const pp)
         && str[str_idx + 2] == KE_FILLER) {
       buf[buf_idx++] = (char)K_SPECIAL;
       str_idx += 2;
-    } else if ((str[str_idx] == K_SPECIAL)
-               && str[str_idx + 1] == KS_EXTRA
-               && str[str_idx + 2] == KE_CSI) {
-      buf[buf_idx++] = (char)CSI;
-      str_idx += 2;
     } else if (str[str_idx] == K_SPECIAL) {
       break;  // A special key can't be a multibyte char.
     } else {
@@ -2112,7 +2114,7 @@ const char *mb_unescape(const char **const pp)
 
     // Return a multi-byte character if it's found.  An illegal sequence
     // will result in a 1 here.
-    if (utf_ptr2len((const char_u *)buf) > 1) {
+    if (utf_ptr2len(buf) > 1) {
       *pp = (const char *)str + str_idx + 1;
       return buf;
     }
@@ -2125,7 +2127,6 @@ const char *mb_unescape(const char **const pp)
   return NULL;
 }
 
-
 /*
  * Skip the Vim specific head of a 'encoding' name.
  */
@@ -2207,11 +2208,9 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
   return r;
 }
 
-/*
- * Search for an encoding alias of "name".
- * Returns -1 when not found.
- */
-static int enc_alias_search(char_u *name)
+/// Search for an encoding alias of "name".
+/// Returns -1 when not found.
+static int enc_alias_search(const char_u *name)
 {
   int i;
 
@@ -2223,7 +2222,6 @@ static int enc_alias_search(char_u *name)
   return -1;
 }
 
-
 #ifdef HAVE_LANGINFO_H
 # include <langinfo.h>
 #endif
@@ -2265,7 +2263,7 @@ char_u *enc_locale(void)
   // Make the name lowercase and replace '_' with '-'.
   // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
   // "ko_KR.EUC" == "euc-kr"
-  const char *p = (char *)vim_strchr((char_u *)s, '.');
+  const char *p = vim_strchr(s, '.');
   if (p != NULL) {
     if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3)
         && !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') {
@@ -2297,7 +2295,6 @@ enc_locale_copy_enc:
 
 #if defined(HAVE_ICONV)
 
-
 /*
  * Call iconv_open() with a check if iconv() works properly (there are broken
  * versions).
@@ -2404,7 +2401,7 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen
       // conversion from 'encoding' to something else.  In other
       // situations we don't know what to skip anyway.
       *to++ = '?';
-      if (utf_ptr2cells((char_u *)from) > 1) {
+      if (utf_ptr2cells(from) > 1) {
         *to++ = '?';
       }
       l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
@@ -2427,7 +2424,6 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen
 
 #endif  // HAVE_ICONV
 
-
 /*
  * Setup "vcp" for conversion from "from" to "to".
  * The names must have been made canonical with enc_canonize().
@@ -2589,7 +2585,7 @@ char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp
       case 0xbe:
         c = 0x0178; break;                 // Y
       }
-      d += utf_char2bytes(c, d);
+      d += utf_char2bytes(c, (char *)d);
     }
     *d = NUL;
     if (lenp != NULL) {
@@ -2620,7 +2616,7 @@ char_u *string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp
         }
         *d++ = ptr[i];
       } else {
-        c = utf_ptr2char(ptr + i);
+        c = utf_ptr2char((char *)ptr + i);
         if (vcp->vc_type == CONV_TO_LATIN9) {
           switch (c) {
           case 0x20ac: