1 files changed, 59 insertions, 31 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 62cc3b56ed..12460646ed 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -346,7 +346,6 @@ static int enc_canon_search(const char_u *name)
 }
 
 
-
 /*
  * Find canonical encoding "name" in the list and return its properties.
  * Returns 0 if not found.
@@ -565,7 +564,7 @@ size_t mb_string2cells(const char_u *str)
 {
   size_t clen = 0;
 
-  for (const char_u *p = str; *p != NUL; p += (*mb_ptr2len)(p)) {
+  for (const char_u *p = str; *p != NUL; p += utfc_ptr2len(p)) {
     clen += utf_ptr2cells(p);
   }
 
@@ -675,16 +674,16 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
   }
 
   if (k <= *n) {
-    /* We have a multibyte sequence and it isn't truncated by buffer
-     * limits so utf_ptr2char() is safe to use. Or the first byte is
-     * illegal (k=0), and it's also safe to use utf_ptr2char(). */
+    // We have a multibyte sequence and it isn't truncated by buffer
+    // limits so utf_ptr2char() is safe to use. Or the first byte is
+    // illegal (k=0), and it's also safe to use utf_ptr2char().
     c = utf_ptr2char(*s);
 
-    /* On failure, utf_ptr2char() returns the first byte, so here we
-     * check equality with the first byte. The only non-ASCII character
-     * which equals the first byte of its own UTF-8 representation is
-     * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
-     * It's safe even if n=1, else we would have k=2 > n. */
+    // On failure, utf_ptr2char() returns the first byte, so here we
+    // check equality with the first byte. The only non-ASCII character
+    // which equals the first byte of its own UTF-8 representation is
+    // U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
+    // It's safe even if n=1, else we would have k=2 > n.
     if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
       // byte sequence was successfully decoded
       *s += k;
@@ -706,7 +705,7 @@ int mb_ptr2char_adv(const char_u **const pp)
   int c;
 
   c = utf_ptr2char(*pp);
-  *pp += (*mb_ptr2len)(*pp);
+  *pp += utfc_ptr2len(*pp);
   return c;
 }
 
@@ -763,7 +762,7 @@ int utfc_ptr2char(const char_u *p, int *pcc)
   // Only accept a composing char when the first char isn't illegal.
   if ((len > 1 || *p < 0x80)
       && p[len] >= 0x80
-      && UTF_COMPOSINGLIKE(p, p + len)) {
+      && utf_composinglike(p, p + len)) {
     cc = utf_ptr2char(p + len);
     for (;; ) {
       pcc[i++] = cc;
@@ -792,9 +791,6 @@ int utfc_ptr2char(const char_u *p, int *pcc)
  */
 int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
 {
-#define IS_COMPOSING(s1, s2, s3) \
-  (i == 0 ? UTF_COMPOSINGLIKE((s1), (s2)) : utf_iscomposing((s3)))
-
   assert(maxlen > 0);
 
   int i = 0;
@@ -810,7 +806,7 @@ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
       int len_cc = utf_ptr2len_len(p + len, maxlen - len);
       safe = len_cc > 1 && len_cc <= maxlen - len;
       if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
-          || !IS_COMPOSING(p, p + len, pcc[i])) {
+          || !(i == 0 ? utf_composinglike(p, p+len) : utf_iscomposing(pcc[i]))) {
         break;
       }
       len += len_cc;
@@ -915,7 +911,7 @@ int utfc_ptr2len(const char_u *const p)
   // skip all of them (otherwise the cursor would get stuck).
   int prevlen = 0;
   for (;;) {
-    if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
+    if (p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
       return len;
     }
 
@@ -965,14 +961,14 @@ int utfc_ptr2len_len(const char_u *p, int size)
 
     /*
      * Next character length should not go beyond size to ensure that
-     * UTF_COMPOSINGLIKE(...) does not read beyond size.
+     * utf_composinglike(...) does not read beyond size.
      */
     len_next_char = utf_ptr2len_len(p + len, size - len);
     if (len_next_char > size - len) {
       break;
     }
 
-    if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
+    if (!utf_composinglike(p + prevlen, p + len)) {
       break;
     }
 
@@ -1582,12 +1578,12 @@ void show_utf8(void)
   int clen;
   int i;
 
-  /* Get the byte length of the char under the cursor, including composing
-   * characters. */
+  // Get the byte length of the char under the cursor, including composing
+  // characters.
   line = get_cursor_pos_ptr();
   len = utfc_ptr2len(line);
   if (len == 0) {
-    MSG("NUL");
+    msg("NUL");
     return;
   }
 
@@ -1610,7 +1606,7 @@ void show_utf8(void)
     }
   }
 
-  msg(IObuff);
+  msg((char *)IObuff);
 }
 
 /// Return offset from "p" to the first byte of the character it points into.
@@ -1625,8 +1621,8 @@ int utf_head_off(const char_u *base, const char_u *p)
     return 0;
   }
 
-  /* Skip backwards over trailing bytes: 10xx.xxxx
-   * Skip backwards again if on a composing char. */
+  // Skip backwards over trailing bytes: 10xx.xxxx
+  // Skip backwards again if on a composing char.
   const char_u *q;
   for (q = p;; --q) {
     // Move s to the last byte of this char.
@@ -1883,6 +1879,40 @@ int mb_tail_off(char_u *base, char_u *p)
   return i;
 }
 
+
+/// Return the offset from "p" to the first byte of the codepoint it points
+/// into. Can start anywhere in a stream of bytes.
+///
+/// The offset is zero or negative: for a non-zero result, "p + result" is the
+/// byte that starts the sequence "p" belongs to.
+///
+/// @param[in] base  Pointer to start of string
+/// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
+///
+/// @return 0 if invalid sequence, else offset to previous codepoint
+int mb_head_off(char_u *base, char_u *p)
+{
+  int i;
+
+  if (*p == NUL) {
+    return 0;
+  }
+
+  // Walk backwards to the first byte that is not 10xx.xxxx. "i" is zero or
+  // negative, so the bound has to be "p + i > base" to stay inside the string.
+  for (i = 0; p + i > base; i--) {
+    if ((p[i] & 0xc0) != 0x80) {
+      break;
+    }
+  }
+
+  // Check for illegal sequence: a table length of 1 means p[i] does not start
+  // a multi-byte sequence, so there is no head to report.
+  if (utf8len_tab[p[i]] == 1) {
+    return 0;
+  }
+  return i;
+}
+
 /*
  * Find the next illegal byte sequence.
  */
@@ -1915,8 +1945,8 @@ void utf_find_illegal(void)
     }
 
     while (*p != NUL) {
-      /* Illegal means that there are not enough trail bytes (checked by
-       * utf_ptr2len()) or too many of them (overlong sequence). */
+      // Illegal means that there are not enough trail bytes (checked by
+      // utf_ptr2len()) or too many of them (overlong sequence).
       len = utf_ptr2len(p);
       if (*p >= 0x80 && (len == 1
                          || utf_char2len(utf_ptr2char(p)) != len)) {
@@ -2021,7 +2051,7 @@ int mb_charlen(char_u *str)
   }
 
   for (count = 0; *p != NUL; count++) {
-    p += (*mb_ptr2len)(p);
+    p += utfc_ptr2len(p);
   }
 
   return count;
@@ -2036,7 +2066,7 @@ int mb_charlen_len(char_u *str, int len)
   int count;
 
   for (count = 0; *p != NUL && p < str + len; count++) {
-    p += (*mb_ptr2len)(p);
+    p += utfc_ptr2len(p);
   }
 
   return count;
@@ -2398,8 +2428,6 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen
 #endif  // HAVE_ICONV
 
 
-
-
 /*
  * Setup "vcp" for conversion from "from" to "to".
  * The names must have been made canonical with enc_canonize().