1 files changed, 265 insertions, 141 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index f2883cc5c7..c7a56209e4 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -32,7 +32,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <sys/types.h>
+#include <uv.h>
 #include <wctype.h>
 
 #include "auto/config.h"
@@ -46,9 +46,10 @@
 #include "nvim/eval/typval.h"
 #include "nvim/eval/typval_defs.h"
 #include "nvim/getchar.h"
-#include "nvim/gettext.h"
+#include "nvim/gettext_defs.h"
 #include "nvim/globals.h"
 #include "nvim/grid.h"
+#include "nvim/grid_defs.h"
 #include "nvim/iconv_defs.h"
 #include "nvim/keycodes.h"
 #include "nvim/macros_defs.h"
@@ -444,24 +445,26 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
 static bool intable(const struct interval *table, size_t n_items, int c)
   FUNC_ATTR_PURE
 {
+  assert(n_items > 0);
   // first quick check for Latin1 etc. characters
   if (c < table[0].first) {
     return false;
   }
 
+  assert(n_items <= SIZE_MAX / 2);
   // binary search in table
-  int bot = 0;
-  int top = (int)(n_items - 1);
-  while (top >= bot) {
-    int mid = (bot + top) / 2;
+  size_t bot = 0;
+  size_t top = n_items;
+  do {
+    size_t mid = (bot + top) >> 1;
     if (table[mid].last < c) {
       bot = mid + 1;
     } else if (table[mid].first > c) {
-      top = mid - 1;
+      top = mid;
     } else {
       return true;
     }
-  }
+  } while (top > bot);
   return false;
 }
 
@@ -475,32 +478,28 @@ static bool intable(const struct interval *table, size_t n_items, int c)
 ///       gen_unicode_tables.lua, which must be manually invoked as needed.
 int utf_char2cells(int c)
 {
-  // Use the value from setcellwidths() at 0x80 and higher, unless the
-  // character is not printable.
-  if (c >= 0x80 && vim_isprintc(c)) {
-    int n = cw_value(c);
-    if (n != 0) {
-      return n;
-    }
+  if (c < 0x80) {
+    return 1;
   }
 
-  if (c >= 0x100) {
-    if (!utf_printable(c)) {
-      return 6;                 // unprintable, displays <xxxx>
-    }
-    if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
-      return 2;
-    }
-    if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
-      return 2;
-    }
-  } else if (c >= 0x80 && !vim_isprintc(c)) {
-    // Characters below 0x100 are influenced by 'isprint' option.
-    return 4;                   // unprintable, displays <xx>
+  if (!vim_isprintc(c)) {
+    assert(c <= 0xFFFF);
+    // unprintable is displayed either as <xx> or <xxxx>
+    return c > 0xFF ? 6 : 4;
+  }
+
+  int n = cw_value(c);
+  if (n != 0) {
+    return n;
   }
 
-  if (c >= 0x80 && *p_ambw == 'd'
-      && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+  if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+    return 2;
+  }
+  if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+    return 2;
+  }
+  if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
     return 2;
   }
 
@@ -527,6 +526,74 @@ int utf_ptr2cells(const char *p)
   return 1;
 }
 
+/// Convert a UTF-8 byte sequence to a character number.
+/// Doesn't handle ascii! only multibyte and illegal sequences.
+///
+/// @param[in]  p      String to convert.
+/// @param[in]  len    Length of the character in bytes, 0 or 1 if illegal.
+///
+/// @return Unicode codepoint. A negative value when the sequence is illegal.
+int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
+  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
+{
+// uint8_t is a reminder for clang to use smaller cmp
+#define CHECK \
+  do { \
+    if (EXPECT((uint8_t)(cur & 0xC0U) != 0x80U, false)) { \
+      return -1; \
+    } \
+  } while (0)
+
+  static uint32_t const corrections[] = {
+    (1U << 31),  // invalid - set invalid bits (safe to add as first 2 bytes
+    (1U << 31),  // won't affect highest bit in normal ret)
+    -(0x80U + (0xC0U << 6)),  // multibyte - subtract added UTF8 bits (1..10xxx and 10xxx)
+    -(0x80U + (0x80U << 6) + (0xE0U << 12)),
+    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0xF0U << 18)),
+    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0xF8U << 24)),
+    -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0x80U << 24)),  // + (0xFCU << 30)
+  };
+
+  // len is 0-6, but declared uintptr_t to avoid zeroing out upper bits
+  uint32_t const corr = corrections[len];
+  uint8_t cur;
+
+  // reading second byte unconditionally, safe for invalid
+  // as it cannot be the last byte, not safe for ascii
+  uint32_t code_point = ((uint32_t)p[0] << 6) + (cur = p[1]);
+  CHECK;
+  if ((uint32_t)len < 3) {
+    goto ret;  // len == 0, 1, 2
+  }
+
+  code_point = (code_point << 6) + (cur = p[2]);
+  CHECK;
+  if ((uint32_t)len == 3) {
+    goto ret;
+  }
+
+  code_point = (code_point << 6) + (cur = p[3]);
+  CHECK;
+  if ((uint32_t)len == 4) {
+    goto ret;
+  }
+
+  code_point = (code_point << 6) + (cur = p[4]);
+  CHECK;
+  if ((uint32_t)len == 5) {
+    goto ret;
+  }
+
+  code_point = (code_point << 6) + (cur = p[5]);
+  CHECK;
+  // len == 6
+
+ret:
+  return (int32_t)(code_point + corr);
+
+#undef CHECK
+}
+
 /// Like utf_ptr2cells(), but limit string length to "size".
 /// For an empty string or truncated character returns 1.
 int utf_ptr2cells_len(const char *p, int size)
@@ -596,45 +663,62 @@ size_t mb_string2cells_len(const char *str, size_t size)
 ///
 /// @return Unicode codepoint or byte value.
 int utf_ptr2char(const char *const p_in)
-  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
+  FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
 {
   uint8_t *p = (uint8_t *)p_in;
-  if (p[0] < 0x80) {  // Be quick for ASCII.
-    return p[0];
+
+  uint32_t const v0 = p[0];
+  if (EXPECT(v0 < 0x80U, true)) {  // Be quick for ASCII.
+    return (int)v0;
   }
 
-  const uint8_t len = utf8len_tab_zero[p[0]];
-  if (len > 1 && (p[1] & 0xc0) == 0x80) {
-    if (len == 2) {
-      return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
-    }
-    if ((p[2] & 0xc0) == 0x80) {
-      if (len == 3) {
-        return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
-                + (p[2] & 0x3f));
-      }
-      if ((p[3] & 0xc0) == 0x80) {
-        if (len == 4) {
-          return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
-                  + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
-        }
-        if ((p[4] & 0xc0) == 0x80) {
-          if (len == 5) {
-            return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
-                    + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
-                    + (p[4] & 0x3f));
-          }
-          if ((p[5] & 0xc0) == 0x80 && len == 6) {
-            return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
-                    + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
-                    + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
-          }
-        }
-      }
-    }
+  const uint8_t len = utf8len_tab[v0];
+  if (EXPECT(len < 2, false)) {
+    return (int)v0;
   }
-  // Illegal value: just return the first byte.
-  return p[0];
+
+#define CHECK(v) \
+  do { \
+    if (EXPECT((uint8_t)((v) & 0xC0U) != 0x80U, false)) { \
+      return (int)v0; \
+    } \
+  } while (0)
+#define LEN_RETURN(len_v, result) \
+  do { \
+    if (len == (len_v)) { \
+      return (int)(result); \
+    } \
+  } while (0)
+#define S(s) ((uint32_t)0x80U << (s))
+
+  uint32_t const v1 = p[1];
+  CHECK(v1);
+  LEN_RETURN(2, (v0 << 6) + v1 - ((0xC0U << 6) + S(0)));
+
+  uint32_t const v2 = p[2];
+  CHECK(v2);
+  LEN_RETURN(3, (v0 << 12) + (v1 << 6) + v2 - ((0xE0U << 12) + S(6) + S(0)));
+
+  uint32_t const v3 = p[3];
+  CHECK(v3);
+  LEN_RETURN(4, (v0 << 18) + (v1 << 12) + (v2 << 6) + v3
+             - ((0xF0U << 18) + S(12) + S(6) + S(0)));
+
+  uint32_t const v4 = p[4];
+  CHECK(v4);
+  LEN_RETURN(5, (v0 << 24) + (v1 << 18) + (v2 << 12) + (v3 << 6) + v4
+             - ((0xF8U << 24) + S(18) + S(12) + S(6) + S(0)));
+
+  uint32_t const v5 = p[5];
+  CHECK(v5);
+  // len == 6
+  return (int)((v0 << 30) + (v1 << 24) + (v2 << 18) + (v3 << 12) + (v4 << 6) + v5
+               // - (0xFCU << 30)
+               - (S(24) + S(18) + S(12) + S(6) + S(0)));
+
+#undef S
+#undef CHECK
+#undef LEN_RETURN
 }
 
 // Convert a UTF-8 byte sequence to a wide character.
@@ -721,6 +805,16 @@ bool utf_composinglike(const char *p1, const char *p2)
   return arabic_combine(utf_ptr2char(p1), c2);
 }
 
+/// Check if the next character is a composing character when it
+/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
+/// behaves like a composing character.
+/// returns false for negative values
+bool utf_char_composinglike(int32_t const first, int32_t const next)
+  FUNC_ATTR_PURE
+{
+  return utf_iscomposing(next) || arabic_combine(first, next);
+}
+
 /// Get the screen char at the beginning of a string
 ///
 /// Caller is expected to check for things like unprintable chars etc
@@ -987,17 +1081,61 @@ int utf_char2bytes(const int c, char *const buf)
   }
 }
 
-// Return true if "c" is a composing UTF-8 character.  This means it will be
-// drawn on top of the preceding character.
-// Based on code from Markus Kuhn.
+/// Return true if "c" is a composing UTF-8 character.
+/// This means it will be drawn on top of the preceding character.
+/// Based on code from Markus Kuhn.
+/// Returns false for negative values.
 bool utf_iscomposing(int c)
 {
   return intable(combining, ARRAY_SIZE(combining), c);
 }
 
+#ifdef __SSE2__
+
+# include <emmintrin.h>
+
+// Return true for characters that can be displayed in a normal way.
+// Only for characters of 0x100 and above!
+bool utf_printable(int c)
+  FUNC_ATTR_CONST
+{
+  if (c < 0x180B || c > 0xFFFF) {
+    return c != 0x70F;
+  }
+
+# define L(v) ((int16_t)((v) - 1))  // lower bound (exclusive)
+# define H(v) ((int16_t)(v))  // upper bound (inclusive)
+
+  // Boundaries of unprintable characters.
+  // Some values are negative when converted to int16_t.
+  // Ranges must not wrap around when converted to int16_t.
+  __m128i const lo = _mm_setr_epi16(L(0x180b), L(0x200b), L(0x202a), L(0x2060),
+                                    L(0xd800), L(0xfeff), L(0xfff9), L(0xfffe));
+
+  __m128i const hi = _mm_setr_epi16(H(0x180e), H(0x200f), H(0x202e), H(0x206f),
+                                    H(0xdfff), H(0xfeff), H(0xfffb), H(0xffff));
+
+# undef L
+# undef H
+
+  __m128i value = _mm_set1_epi16((int16_t)c);
+
+  // Using _mm_cmplt_epi16() is less optimal, since it would require
+  // swapping operands (sse2 only has cmpgt instruction),
+  // and only the second operand can be a memory location.
+
+  // Character is printable when it is above/below both bounds of each range
+  // (corresponding bits in both masks are equal).
+  return _mm_movemask_epi8(_mm_cmpgt_epi16(value, lo))
+         == _mm_movemask_epi8(_mm_cmpgt_epi16(value, hi));
+}
+
+#else
+
 // Return true for characters that can be displayed in a normal way.
 // Only for characters of 0x100 and above!
 bool utf_printable(int c)
+  FUNC_ATTR_PURE
 {
   // Sorted list of non-overlapping intervals.
   // 0xd800-0xdfff is reserved for UTF-16, actually illegal.
@@ -1010,6 +1148,8 @@ bool utf_printable(int c)
   return !intable(nonprint, ARRAY_SIZE(nonprint), c);
 }
 
+#endif
+
 // Get class of a Unicode character.
 // 0: white space
 // 1: punctuation
@@ -1183,6 +1323,9 @@ int utf_fold(int a)
 // invalid values or can't handle latin1 when the locale is C.
 // Speed is most important here.
 
+// Note: UnicodeData.txt does not define U+1E9E as being the corresponding upper
+// case letter for U+00DF (ß), however it is part of the toLower table
+
 /// Return the upper-case equivalent of "a", which is a UCS-4 character.  Use
 /// simple case folding.
 int mb_toupper(int a)
@@ -1422,7 +1565,8 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
 void mb_utflen(const char *s, size_t len, size_t *codepoints, size_t *codeunits)
   FUNC_ATTR_NONNULL_ALL
 {
-  size_t count = 0, extra = 0;
+  size_t count = 0;
+  size_t extra = 0;
   size_t clen;
   for (size_t i = 0; i < len; i += clen) {
     clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i));
@@ -1740,99 +1884,66 @@ void mb_copy_char(const char **const fp, char **const tp)
   *fp += l;
 }
 
-/// Return the offset from "p_in" to the first byte of a character.  When "p_in" is
+/// Return the offset from "p" to the first byte of a character.  When "p" is
 /// at the start of a character 0 is returned, otherwise the offset to the next
 /// character.  Can start anywhere in a stream of bytes.
-int mb_off_next(const char *base, const char *p_in)
+int mb_off_next(const char *base, const char *p)
 {
-  const uint8_t *p = (uint8_t *)p_in;
-  int i;
+  int head_off = utf_head_off(base, p);
 
-  if (*p < 0x80) {              // be quick for ASCII
+  if (head_off == 0) {
     return 0;
   }
 
-  // Find the next character that isn't 10xx.xxxx
-  for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
-  if (i > 0) {
-    int j;
-    // Check for illegal sequence.
-    for (j = 0; p - j > (uint8_t *)base; j++) {
-      if ((p[-j] & 0xc0) != 0x80) {
-        break;
-      }
-    }
-    if (utf8len_tab[p[-j]] != i + j) {
-      return 0;
-    }
-  }
-  return i;
+  return utfc_ptr2len(p - head_off) - head_off;
 }
 
-/// Return the offset from `p_in` to the last byte of the codepoint it points
-/// to.  Can start anywhere in a stream of bytes.
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// "p_len" limits number of bytes after "p_in".
 /// Note: Counts individual codepoints of composed characters separately.
-int utf_cp_tail_off(const char *base, const char *p_in)
+CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len)
+  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
 {
-  const uint8_t *p = (uint8_t *)p_in;
-  int i;
-  int j;
-
-  if (*p == NUL) {
-    return 0;
+  assert(base <= p_in && p_len > 0);
+  uint8_t const *const b = (uint8_t *)base;
+  uint8_t const *const p = (uint8_t *)p_in;
+  if (*p < 0x80U) {  // be quick for ASCII
+    return (CharBoundsOff){ 0, 1 };
   }
 
-  // Find the last character that is 10xx.xxxx
-  for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
-
-  // Check for illegal sequence.
-  for (j = 0; p_in - j > base; j++) {
-    if ((p[-j] & 0xc0) != 0x80) {
-      break;
+  int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1);
+  int first_off = 0;
+  for (; utf_is_trail_byte(p[first_off]); first_off--) {
+    if (first_off == max_first_off) {  // failed to find first byte
+      return (CharBoundsOff){ 0, 1 };
     }
   }
 
-  if (utf8len_tab[p[-j]] != i + j + 1) {
-    return 0;
+  int const max_end_off = utf8len_tab[p[first_off]] + first_off;
+  if (max_end_off <= 0 || max_end_off > p_len) {  // illegal or incomplete sequence
+    return (CharBoundsOff){ 0, 1 };
   }
-  return i;
-}
 
-/// Return the offset from "p" to the first byte of the codepoint it points
-/// to. Can start anywhere in a stream of bytes.
-/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately.
-///
-/// @param[in] base  Pointer to start of string
-/// @param[in] p     Pointer to byte for which to return the offset to the previous codepoint
-//
-/// @return 0 if invalid sequence, else number of bytes to previous codepoint
-int utf_cp_head_off(const char *base, const char *p)
-{
-  int i;
-
-  if (*p == NUL) {
-    return 0;
-  }
-
-  // Find the first character that is not 10xx.xxxx
-  for (i = 0; p - i >= base; i++) {
-    if (((uint8_t)p[-i] & 0xc0) != 0x80) {
-      break;
+  for (int end_off = 1; end_off < max_end_off; end_off++) {
+    if (!utf_is_trail_byte(p[end_off])) {  // not enough trail bytes
+      return (CharBoundsOff){ 0, 1 };
     }
   }
 
-  // Find the last character that is 10xx.xxxx (condition terminates on NUL)
-  int j = 1;
-  while (((uint8_t)p[j] & 0xc0) == 0x80) {
-    j++;
-  }
+  return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off };
+}
 
-  // Check for illegal sequence.
-  if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
-    return 0;
-  }
-  return i;
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// Stream must be NUL-terminated.
+/// Note: Counts individual codepoints of composed characters separately.
+CharBoundsOff utf_cp_bounds(char const *base, char const *p_in)
+  FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
+{
+  return utf_cp_bounds_len(base, p_in, INT_MAX);
 }
 
 // Find the next illegal byte sequence.
@@ -2250,7 +2361,7 @@ void *my_iconv_open(char *to, char *from)
     // stops for no apparent reason after about 8160 characters.
     char *p = tobuf;
     size_t tolen = ICONV_TESTLEN;
-    (void)iconv(fd, NULL, NULL, &p, &tolen);
+    iconv(fd, NULL, NULL, &p, &tolen);
     if (p == NULL) {
       iconv_working = kBroken;
       iconv_close(fd);
@@ -2651,8 +2762,10 @@ static int tv_nr_compare(const void *a1, const void *a2)
 {
   const listitem_T *const li1 = tv_list_first(*(const list_T **)a1);
   const listitem_T *const li2 = tv_list_first(*(const list_T **)a2);
+  const varnumber_T n1 = TV_LIST_ITEM_TV(li1)->vval.v_number;
+  const varnumber_T n2 = TV_LIST_ITEM_TV(li2)->vval.v_number;
 
-  return (int)(TV_LIST_ITEM_TV(li1)->vval.v_number - TV_LIST_ITEM_TV(li2)->vval.v_number);
+  return n1 == n2 ? 0 : n1 > n2 ? 1 : -1;
 }
 
 /// "setcellwidths()" function
@@ -2802,3 +2915,14 @@ char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx)
 
   return (char *)enc_canon_table[idx].name;
 }
+
+/// Compare strings
+///
+/// @param[in]  ic  True if case is to be ignored.
+///
+/// @return 0 if s1 == s2, <0 if s1 < s2, >0 if s1 > s2.
+int mb_strcmp_ic(bool ic, const char *s1, const char *s2)
+  FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
+{
+  return (ic ? mb_stricmp(s1, s2) : strcmp(s1, s2));
+}