about | summary | refs | log | tree | commit | diff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- src/nvim/mbyte.c 406
1 files changed, 265 insertions, 141 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index f2883cc5c7..c7a56209e4 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -32,7 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <sys/types.h>
+#include <uv.h>
#include <wctype.h>
#include "auto/config.h"
@@ -46,9 +46,10 @@
#include "nvim/eval/typval.h"
#include "nvim/eval/typval_defs.h"
#include "nvim/getchar.h"
-#include "nvim/gettext.h"
+#include "nvim/gettext_defs.h"
#include "nvim/globals.h"
#include "nvim/grid.h"
+#include "nvim/grid_defs.h"
#include "nvim/iconv_defs.h"
#include "nvim/keycodes.h"
#include "nvim/macros_defs.h"
@@ -444,24 +445,26 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
static bool intable(const struct interval *table, size_t n_items, int c)
FUNC_ATTR_PURE
{
+ assert(n_items > 0);
// first quick check for Latin1 etc. characters
if (c < table[0].first) {
return false;
}
+ assert(n_items <= SIZE_MAX / 2);
// binary search in table
- int bot = 0;
- int top = (int)(n_items - 1);
- while (top >= bot) {
- int mid = (bot + top) / 2;
+ size_t bot = 0;
+ size_t top = n_items;
+ do {
+ size_t mid = (bot + top) >> 1;
if (table[mid].last < c) {
bot = mid + 1;
} else if (table[mid].first > c) {
- top = mid - 1;
+ top = mid;
} else {
return true;
}
- }
+ } while (top > bot);
return false;
}
@@ -475,32 +478,28 @@ static bool intable(const struct interval *table, size_t n_items, int c)
/// gen_unicode_tables.lua, which must be manually invoked as needed.
int utf_char2cells(int c)
{
- // Use the value from setcellwidths() at 0x80 and higher, unless the
- // character is not printable.
- if (c >= 0x80 && vim_isprintc(c)) {
- int n = cw_value(c);
- if (n != 0) {
- return n;
- }
+ if (c < 0x80) {
+ return 1;
}
- if (c >= 0x100) {
- if (!utf_printable(c)) {
- return 6; // unprintable, displays <xxxx>
- }
- if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
- return 2;
- }
- if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
- return 2;
- }
- } else if (c >= 0x80 && !vim_isprintc(c)) {
- // Characters below 0x100 are influenced by 'isprint' option.
- return 4; // unprintable, displays <xx>
+ if (!vim_isprintc(c)) {
+ assert(c <= 0xFFFF);
+ // unprintable is displayed either as <xx> or <xxxx>
+ return c > 0xFF ? 6 : 4;
+ }
+
+ int n = cw_value(c);
+ if (n != 0) {
+ return n;
}
- if (c >= 0x80 && *p_ambw == 'd'
- && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+ if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+ return 2;
+ }
+ if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+ return 2;
+ }
+ if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
return 2;
}
@@ -527,6 +526,74 @@ int utf_ptr2cells(const char *p)
return 1;
}
+/// Convert a UTF-8 byte sequence to a character number.
+/// Doesn't handle ASCII! Only multibyte and illegal sequences.
+///
+/// @param[in] p String to convert.
+/// @param[in] len Length of the character in bytes, 0 or 1 if illegal.
+///
+/// @return Unicode codepoint. A negative value when the sequence is illegal.
+int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
+ FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
+{
+// uint8_t is a reminder for clang to use smaller cmp
+#define CHECK \
+ do { \
+ if (EXPECT((uint8_t)(cur & 0xC0U) != 0x80U, false)) { \
+ return -1; \
+ } \
+ } while (0)
+
+ static uint32_t const corrections[] = {
+ (1U << 31), // invalid - set invalid bits (safe to add as first 2 bytes
+ (1U << 31), // won't affect highest bit in normal ret)
+ -(0x80U + (0xC0U << 6)), // multibyte - subtract added UTF8 bits (1..10xxx and 10xxx)
+ -(0x80U + (0x80U << 6) + (0xE0U << 12)),
+ -(0x80U + (0x80U << 6) + (0x80U << 12) + (0xF0U << 18)),
+ -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0xF8U << 24)),
+ -(0x80U + (0x80U << 6) + (0x80U << 12) + (0x80U << 18) + (0x80U << 24)), // + (0xFCU << 30)
+ };
+
+ // len is 0-6, but declared uintptr_t to avoid zeroing out upper bits
+ uint32_t const corr = corrections[len];
+ uint8_t cur;
+
+ // reading second byte unconditionally, safe for invalid
+ // as it cannot be the last byte, not safe for ascii
+ uint32_t code_point = ((uint32_t)p[0] << 6) + (cur = p[1]);
+ CHECK;
+ if ((uint32_t)len < 3) {
+ goto ret; // len == 0, 1, 2
+ }
+
+ code_point = (code_point << 6) + (cur = p[2]);
+ CHECK;
+ if ((uint32_t)len == 3) {
+ goto ret;
+ }
+
+ code_point = (code_point << 6) + (cur = p[3]);
+ CHECK;
+ if ((uint32_t)len == 4) {
+ goto ret;
+ }
+
+ code_point = (code_point << 6) + (cur = p[4]);
+ CHECK;
+ if ((uint32_t)len == 5) {
+ goto ret;
+ }
+
+ code_point = (code_point << 6) + (cur = p[5]);
+ CHECK;
+ // len == 6
+
+ret:
+ return (int32_t)(code_point + corr);
+
+#undef CHECK
+}
+
/// Like utf_ptr2cells(), but limit string length to "size".
/// For an empty string or truncated character returns 1.
int utf_ptr2cells_len(const char *p, int size)
@@ -596,45 +663,62 @@ size_t mb_string2cells_len(const char *str, size_t size)
///
/// @return Unicode codepoint or byte value.
int utf_ptr2char(const char *const p_in)
- FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
+ FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
uint8_t *p = (uint8_t *)p_in;
- if (p[0] < 0x80) { // Be quick for ASCII.
- return p[0];
+
+ uint32_t const v0 = p[0];
+ if (EXPECT(v0 < 0x80U, true)) { // Be quick for ASCII.
+ return (int)v0;
}
- const uint8_t len = utf8len_tab_zero[p[0]];
- if (len > 1 && (p[1] & 0xc0) == 0x80) {
- if (len == 2) {
- return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
- }
- if ((p[2] & 0xc0) == 0x80) {
- if (len == 3) {
- return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
- + (p[2] & 0x3f));
- }
- if ((p[3] & 0xc0) == 0x80) {
- if (len == 4) {
- return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
- + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
- }
- if ((p[4] & 0xc0) == 0x80) {
- if (len == 5) {
- return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
- + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
- + (p[4] & 0x3f));
- }
- if ((p[5] & 0xc0) == 0x80 && len == 6) {
- return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
- + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
- + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
- }
- }
- }
- }
+ const uint8_t len = utf8len_tab[v0];
+ if (EXPECT(len < 2, false)) {
+ return (int)v0;
}
- // Illegal value: just return the first byte.
- return p[0];
+
+#define CHECK(v) \
+ do { \
+ if (EXPECT((uint8_t)((v) & 0xC0U) != 0x80U, false)) { \
+ return (int)v0; \
+ } \
+ } while (0)
+#define LEN_RETURN(len_v, result) \
+ do { \
+ if (len == (len_v)) { \
+ return (int)(result); \
+ } \
+ } while (0)
+#define S(s) ((uint32_t)0x80U << (s))
+
+ uint32_t const v1 = p[1];
+ CHECK(v1);
+ LEN_RETURN(2, (v0 << 6) + v1 - ((0xC0U << 6) + S(0)));
+
+ uint32_t const v2 = p[2];
+ CHECK(v2);
+ LEN_RETURN(3, (v0 << 12) + (v1 << 6) + v2 - ((0xE0U << 12) + S(6) + S(0)));
+
+ uint32_t const v3 = p[3];
+ CHECK(v3);
+ LEN_RETURN(4, (v0 << 18) + (v1 << 12) + (v2 << 6) + v3
+ - ((0xF0U << 18) + S(12) + S(6) + S(0)));
+
+ uint32_t const v4 = p[4];
+ CHECK(v4);
+ LEN_RETURN(5, (v0 << 24) + (v1 << 18) + (v2 << 12) + (v3 << 6) + v4
+ - ((0xF8U << 24) + S(18) + S(12) + S(6) + S(0)));
+
+ uint32_t const v5 = p[5];
+ CHECK(v5);
+ // len == 6
+ return (int)((v0 << 30) + (v1 << 24) + (v2 << 18) + (v3 << 12) + (v4 << 6) + v5
+ // - (0xFCU << 30)
+ - (S(24) + S(18) + S(12) + S(6) + S(0)));
+
+#undef S
+#undef CHECK
+#undef LEN_RETURN
}
// Convert a UTF-8 byte sequence to a wide character.
@@ -721,6 +805,16 @@ bool utf_composinglike(const char *p1, const char *p2)
return arabic_combine(utf_ptr2char(p1), c2);
}
+/// Check if the next character is a composing character when it
+/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
+/// behaves like a composing character.
+/// Returns false for negative values.
+bool utf_char_composinglike(int32_t const first, int32_t const next)
+ FUNC_ATTR_PURE
+{
+ return utf_iscomposing(next) || arabic_combine(first, next);
+}
+
/// Get the screen char at the beginning of a string
///
/// Caller is expected to check for things like unprintable chars etc
@@ -987,17 +1081,61 @@ int utf_char2bytes(const int c, char *const buf)
}
}
-// Return true if "c" is a composing UTF-8 character. This means it will be
-// drawn on top of the preceding character.
-// Based on code from Markus Kuhn.
+/// Return true if "c" is a composing UTF-8 character.
+/// This means it will be drawn on top of the preceding character.
+/// Based on code from Markus Kuhn.
+/// Returns false for negative values.
bool utf_iscomposing(int c)
{
return intable(combining, ARRAY_SIZE(combining), c);
}
+#ifdef __SSE2__
+
+# include <emmintrin.h>
+
+// Return true for characters that can be displayed in a normal way.
+// Only for characters of 0x100 and above!
+bool utf_printable(int c)
+ FUNC_ATTR_CONST
+{
+ if (c < 0x180B || c > 0xFFFF) {
+ return c != 0x70F;
+ }
+
+# define L(v) ((int16_t)((v) - 1)) // lower bound (exclusive)
+# define H(v) ((int16_t)(v)) // upper bound (inclusive)
+
+ // Boundaries of unprintable characters.
+ // Some values are negative when converted to int16_t.
+ // Ranges must not wrap around when converted to int16_t.
+ __m128i const lo = _mm_setr_epi16(L(0x180b), L(0x200b), L(0x202a), L(0x2060),
+ L(0xd800), L(0xfeff), L(0xfff9), L(0xfffe));
+
+ __m128i const hi = _mm_setr_epi16(H(0x180e), H(0x200f), H(0x202e), H(0x206f),
+ H(0xdfff), H(0xfeff), H(0xfffb), H(0xffff));
+
+# undef L
+# undef H
+
+ __m128i value = _mm_set1_epi16((int16_t)c);
+
+ // Using _mm_cmplt_epi16() is less optimal, since it would require
+ // swapping operands (sse2 only has cmpgt instruction),
+ // and only the second operand can be a memory location.
+
+ // Character is printable when it is above/below both bounds of each range
+ // (corresponding bits in both masks are equal).
+ return _mm_movemask_epi8(_mm_cmpgt_epi16(value, lo))
+ == _mm_movemask_epi8(_mm_cmpgt_epi16(value, hi));
+}
+
+#else
+
// Return true for characters that can be displayed in a normal way.
// Only for characters of 0x100 and above!
bool utf_printable(int c)
+ FUNC_ATTR_PURE
{
// Sorted list of non-overlapping intervals.
// 0xd800-0xdfff is reserved for UTF-16, actually illegal.
@@ -1010,6 +1148,8 @@ bool utf_printable(int c)
return !intable(nonprint, ARRAY_SIZE(nonprint), c);
}
+#endif
+
// Get class of a Unicode character.
// 0: white space
// 1: punctuation
@@ -1183,6 +1323,9 @@ int utf_fold(int a)
// invalid values or can't handle latin1 when the locale is C.
// Speed is most important here.
+// Note: UnicodeData.txt does not define U+1E9E as being the corresponding upper
+// case letter for U+00DF (ß); however, it is part of the toLower table.
+
/// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
/// simple case folding.
int mb_toupper(int a)
@@ -1422,7 +1565,8 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
void mb_utflen(const char *s, size_t len, size_t *codepoints, size_t *codeunits)
FUNC_ATTR_NONNULL_ALL
{
- size_t count = 0, extra = 0;
+ size_t count = 0;
+ size_t extra = 0;
size_t clen;
for (size_t i = 0; i < len; i += clen) {
clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i));
@@ -1740,99 +1884,66 @@ void mb_copy_char(const char **const fp, char **const tp)
*fp += l;
}
-/// Return the offset from "p_in" to the first byte of a character. When "p_in" is
+/// Return the offset from "p" to the first byte of a character. When "p" is
/// at the start of a character 0 is returned, otherwise the offset to the next
/// character. Can start anywhere in a stream of bytes.
-int mb_off_next(const char *base, const char *p_in)
+int mb_off_next(const char *base, const char *p)
{
- const uint8_t *p = (uint8_t *)p_in;
- int i;
+ int head_off = utf_head_off(base, p);
- if (*p < 0x80) { // be quick for ASCII
+ if (head_off == 0) {
return 0;
}
- // Find the next character that isn't 10xx.xxxx
- for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
- if (i > 0) {
- int j;
- // Check for illegal sequence.
- for (j = 0; p - j > (uint8_t *)base; j++) {
- if ((p[-j] & 0xc0) != 0x80) {
- break;
- }
- }
- if (utf8len_tab[p[-j]] != i + j) {
- return 0;
- }
- }
- return i;
+ return utfc_ptr2len(p - head_off) - head_off;
}
-/// Return the offset from `p_in` to the last byte of the codepoint it points
-/// to. Can start anywhere in a stream of bytes.
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// "p_len" limits number of bytes after "p_in".
/// Note: Counts individual codepoints of composed characters separately.
-int utf_cp_tail_off(const char *base, const char *p_in)
+CharBoundsOff utf_cp_bounds_len(char const *base, char const *p_in, int p_len)
+ FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
{
- const uint8_t *p = (uint8_t *)p_in;
- int i;
- int j;
-
- if (*p == NUL) {
- return 0;
+ assert(base <= p_in && p_len > 0);
+ uint8_t const *const b = (uint8_t *)base;
+ uint8_t const *const p = (uint8_t *)p_in;
+ if (*p < 0x80U) { // be quick for ASCII
+ return (CharBoundsOff){ 0, 1 };
}
- // Find the last character that is 10xx.xxxx
- for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
-
- // Check for illegal sequence.
- for (j = 0; p_in - j > base; j++) {
- if ((p[-j] & 0xc0) != 0x80) {
- break;
+ int const max_first_off = -MIN((int)(p - b), MB_MAXCHAR - 1);
+ int first_off = 0;
+ for (; utf_is_trail_byte(p[first_off]); first_off--) {
+ if (first_off == max_first_off) { // failed to find first byte
+ return (CharBoundsOff){ 0, 1 };
}
}
- if (utf8len_tab[p[-j]] != i + j + 1) {
- return 0;
+ int const max_end_off = utf8len_tab[p[first_off]] + first_off;
+ if (max_end_off <= 0 || max_end_off > p_len) { // illegal or incomplete sequence
+ return (CharBoundsOff){ 0, 1 };
}
- return i;
-}
-/// Return the offset from "p" to the first byte of the codepoint it points
-/// to. Can start anywhere in a stream of bytes.
-/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately.
-///
-/// @param[in] base Pointer to start of string
-/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
-//
-/// @return 0 if invalid sequence, else number of bytes to previous codepoint
-int utf_cp_head_off(const char *base, const char *p)
-{
- int i;
-
- if (*p == NUL) {
- return 0;
- }
-
- // Find the first character that is not 10xx.xxxx
- for (i = 0; p - i >= base; i++) {
- if (((uint8_t)p[-i] & 0xc0) != 0x80) {
- break;
+ for (int end_off = 1; end_off < max_end_off; end_off++) {
+ if (!utf_is_trail_byte(p[end_off])) { // not enough trail bytes
+ return (CharBoundsOff){ 0, 1 };
}
}
- // Find the last character that is 10xx.xxxx (condition terminates on NUL)
- int j = 1;
- while (((uint8_t)p[j] & 0xc0) == 0x80) {
- j++;
- }
+ return (CharBoundsOff){ .begin_off = (int8_t)-first_off, .end_off = (int8_t)max_end_off };
+}
- // Check for illegal sequence.
- if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
- return 0;
- }
- return i;
+/// Returns the offset in bytes from "p_in" to the first and one-past-end bytes
+/// of the codepoint it points to.
+/// "p_in" can point anywhere in a stream of bytes.
+/// Stream must be NUL-terminated.
+/// Note: Counts individual codepoints of composed characters separately.
+CharBoundsOff utf_cp_bounds(char const *base, char const *p_in)
+ FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL
+{
+ return utf_cp_bounds_len(base, p_in, INT_MAX);
}
// Find the next illegal byte sequence.
@@ -2250,7 +2361,7 @@ void *my_iconv_open(char *to, char *from)
// stops for no apparent reason after about 8160 characters.
char *p = tobuf;
size_t tolen = ICONV_TESTLEN;
- (void)iconv(fd, NULL, NULL, &p, &tolen);
+ iconv(fd, NULL, NULL, &p, &tolen);
if (p == NULL) {
iconv_working = kBroken;
iconv_close(fd);
@@ -2651,8 +2762,10 @@ static int tv_nr_compare(const void *a1, const void *a2)
{
const listitem_T *const li1 = tv_list_first(*(const list_T **)a1);
const listitem_T *const li2 = tv_list_first(*(const list_T **)a2);
+ const varnumber_T n1 = TV_LIST_ITEM_TV(li1)->vval.v_number;
+ const varnumber_T n2 = TV_LIST_ITEM_TV(li2)->vval.v_number;
- return (int)(TV_LIST_ITEM_TV(li1)->vval.v_number - TV_LIST_ITEM_TV(li2)->vval.v_number);
+ return n1 == n2 ? 0 : n1 > n2 ? 1 : -1;
}
/// "setcellwidths()" function
@@ -2802,3 +2915,14 @@ char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx)
return (char *)enc_canon_table[idx].name;
}
+
+/// Compare strings
+///
+/// @param[in] ic True if case is to be ignored.
+///
+/// @return 0 if s1 == s2, <0 if s1 < s2, >0 if s1 > s2.
+int mb_strcmp_ic(bool ic, const char *s1, const char *s2)
+ FUNC_ATTR_NONNULL_ALL FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
+{
+ return (ic ? mb_stricmp(s1, s2) : strcmp(s1, s2));
+}