aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c464
1 files changed, 304 insertions, 160 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index a345795bbe..01e720283e 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <utf8proc.h>
#include <uv.h>
#include <wctype.h>
@@ -43,6 +44,7 @@
#include "nvim/cmdexpand_defs.h"
#include "nvim/cursor.h"
#include "nvim/drawscreen.h"
+#include "nvim/errors.h"
#include "nvim/eval/typval.h"
#include "nvim/eval/typval_defs.h"
#include "nvim/getchar.h"
@@ -83,7 +85,6 @@ struct interval {
// uncrustify:off
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.c.generated.h"
-# include "unicode_tables.generated.h"
#endif
// uncrustify:on
@@ -442,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
return utf_class_tab(utf_ptr2char(p), chartab);
}
-// Return true if "c" is in "table".
-static bool intable(const struct interval *table, size_t n_items, int c)
- FUNC_ATTR_PURE
+static bool prop_is_emojilike(const utf8proc_property_t *prop)
{
- assert(n_items > 0);
- // first quick check for Latin1 etc. characters
- if (c < table[0].first) {
- return false;
- }
-
- assert(n_items <= SIZE_MAX / 2);
- // binary search in table
- size_t bot = 0;
- size_t top = n_items;
- do {
- size_t mid = (bot + top) >> 1;
- if (table[mid].last < c) {
- bot = mid + 1;
- } else if (table[mid].first > c) {
- top = mid;
- } else {
- return true;
- }
- } while (top > bot);
- return false;
+ return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+ || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
}
/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
@@ -494,13 +474,18 @@ int utf_char2cells(int c)
return n;
}
- if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+
+ if (prop->charwidth == 2) {
return 2;
}
- if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+ if (*p_ambw == 'd' && prop->ambiguous_width) {
return 2;
}
- if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+
+ // Characters below 1F000 may be considered single width traditionally,
+ // making them double width causes problems.
+ if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
return 2;
}
@@ -509,31 +494,43 @@ int utf_char2cells(int c)
/// Return the number of display cells character at "*p" occupies.
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
-int utf_ptr2cells(const char *p)
+int utf_ptr2cells(const char *p_in)
{
+ const uint8_t *p = (const uint8_t *)p_in;
// Need to convert to a character number.
- if ((uint8_t)(*p) >= 0x80) {
- int c = utf_ptr2char(p);
+ if ((*p) >= 0x80) {
+ int len = utf8len_tab[*p];
+ int32_t c = utf_ptr2CharInfo_impl(p, (uintptr_t)len);
// An illegal byte is displayed as <xx>.
- if (utf_ptr2len(p) == 1 || c == NUL) {
+ if (c <= 0) {
return 4;
}
// If the char is ASCII it must be an overlong sequence.
if (c < 0x80) {
return char2cells(c);
}
- return utf_char2cells(c);
+ int cells = utf_char2cells(c);
+ if (cells == 1 && p_emoji
+ && prop_is_emojilike(utf8proc_get_property(c))) {
+ int c2 = utf_ptr2char(p_in + len);
+ if (c2 == 0xFE0F) {
+ return 2; // emoji presentation
+ }
+ }
+ return cells;
}
return 1;
}
/// Convert a UTF-8 byte sequence to a character number.
-/// Doesn't handle ascii! only multibyte and illegal sequences.
+/// Doesn't handle ASCII! Only multibyte and illegal sequences. ASCII chars (including NUL)
+/// are treated like illegal sequences.
///
/// @param[in] p String to convert.
/// @param[in] len Length of the character in bytes, 0 or 1 if illegal.
///
-/// @return Unicode codepoint. A negative value when the sequence is illegal.
+/// @return Unicode codepoint. A negative value when the sequence is illegal (or
+/// ASCII, including NUL).
int32_t utf_ptr2CharInfo_impl(uint8_t const *p, uintptr_t const len)
FUNC_ATTR_PURE FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT
{
@@ -601,7 +598,8 @@ int utf_ptr2cells_len(const char *p, int size)
{
// Need to convert to a wide character.
if (size > 0 && (uint8_t)(*p) >= 0x80) {
- if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
+ int len = utf_ptr2len_len(p, size);
+ if (len < utf8len_tab[(uint8_t)(*p)]) {
return 1; // truncated
}
int c = utf_ptr2char(p);
@@ -613,7 +611,16 @@ int utf_ptr2cells_len(const char *p, int size)
if (c < 0x80) {
return char2cells(c);
}
- return utf_char2cells(c);
+ int cells = utf_char2cells(c);
+ if (cells == 1 && p_emoji && size > len
+ && prop_is_emojilike(utf8proc_get_property(c))
+ && utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
+ int c2 = utf_ptr2char(p + len);
+ if (c2 == 0xFE0F) {
+ return 2; // emoji presentation
+ }
+ }
+ return cells;
}
return 1;
}
@@ -646,8 +653,8 @@ size_t mb_string2cells_len(const char *str, size_t size)
size_t clen = 0;
for (const char *p = str; *p != NUL && p < str + size;
- p += utfc_ptr2len_len(p, (int)size + (int)(p - str))) {
- clen += (size_t)utf_ptr2cells(p);
+ p += utfc_ptr2len_len(p, (int)size - (int)(p - str))) {
+ clen += (size_t)utf_ptr2cells_len(p, (int)size - (int)(p - str));
}
return clen;
@@ -791,29 +798,48 @@ int mb_cptr2char_adv(const char **pp)
return c;
}
+/// When "c" is the first char of a string, determine if it needs to be prefixed
+/// by a space byte to be drawn correctly, and not merge with the space left of
+/// the string.
+bool utf_iscomposing_first(int c)
+{
+ return c >= 128 && !utf8proc_grapheme_break(' ', c);
+}
+
/// Check if the character pointed to by "p2" is a composing character when it
-/// comes after "p1". For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-bool utf_composinglike(const char *p1, const char *p2)
+/// comes after "p1".
+///
+/// We use the definition in UAX#29 as implemented by utf8proc with the following
+/// exceptions:
+///
+/// - ASCII chars always begin a new cluster. This is a long assumed invariant
+/// in the code base and very useful for performance (we can exit early for ASCII
+/// all over the place, branch predictor go brrr in ASCII-only text).
+/// As of Unicode 15.1 this will only break BOUNDCLASS_PREPEND followed by ASCII,
+/// which should be exceedingly rare (these PREPEND chars are expected to be
+/// followed by multibyte chars within the same script family)
+///
+/// - When 'arabicshape' is active, some pairs of Arabic letters "ab" are replaced with
+/// "c" taking one single cell, which behaves like a cluster.
+///
+/// @param state Should be set to GRAPHEME_STATE_INIT before the first call.
+/// It is allowed to be NULL, but then some longer sequences,
+/// like ZWJ-based emoji, will not be handled.
+bool utf_composinglike(const char *p1, const char *p2, GraphemeState *state)
+ FUNC_ATTR_NONNULL_ARG(1, 2)
{
- int c2 = utf_ptr2char(p2);
- if (utf_iscomposing(c2)) {
- return true;
- }
- if (!arabic_maycombine(c2)) {
+ if ((uint8_t)(*p2) < 128) {
return false;
}
- return arabic_combine(utf_ptr2char(p1), c2);
-}
-/// Check if the next character is a composing character when it
-/// comes after the first. For Arabic sometimes "ab" is replaced with "c", which
-/// behaves like a composing character.
-/// returns false for negative values
-bool utf_char_composinglike(int32_t const first, int32_t const next)
- FUNC_ATTR_PURE
-{
- return utf_iscomposing(next) || arabic_combine(first, next);
+ int first = utf_ptr2char(p1);
+ int second = utf_ptr2char(p2);
+
+ if (!utf8proc_grapheme_break_stateful(first, second, state)) {
+ return true;
+ }
+
+ return arabic_combine(first, second);
}
/// Get the screen char at the beginning of a string
@@ -832,7 +858,7 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
{
int c = utf_ptr2char(p);
*firstc = c; // NOT optional, you are gonna need it
- bool first_compose = utf_iscomposing(c);
+ bool first_compose = utf_iscomposing_first(c);
size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
@@ -843,16 +869,13 @@ schar_T utfc_ptr2schar(const char *p, int *firstc)
return schar_from_buf_first(p, len, first_compose);
}
-/// Get the screen char at the beginning of a string with length
+/// Get the screen char from a char with a known length
///
/// Like utfc_ptr2schar but use no more than p[maxlen].
-schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+schar_T utfc_ptrlen2schar(const char *p, int len, int *firstc)
FUNC_ATTR_NONNULL_ALL
{
- assert(maxlen > 0);
-
- size_t len = (size_t)utf_ptr2len_len(p, maxlen);
- if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+ if ((len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
// invalid or truncated sequence
*firstc = (uint8_t)(*p);
return 0;
@@ -860,11 +883,13 @@ schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
int c = utf_ptr2char(p);
*firstc = c;
- bool first_compose = utf_iscomposing(c);
- maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
- len = (size_t)utfc_ptr2len_len(p, maxlen);
+ bool first_compose = utf_iscomposing_first(c);
+ int maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+ if (len > maxlen) {
+ len = utfc_ptr2len_len(p, maxlen);
+ }
- return schar_from_buf_first(p, len, first_compose);
+ return schar_from_buf_first(p, (size_t)len, first_compose);
}
/// Caller must ensure there is space for `first_compose`
@@ -962,8 +987,9 @@ int utfc_ptr2len(const char *const p)
// Check for composing characters.
int prevlen = 0;
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
- if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
+ if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len, &state)) {
return len;
}
@@ -994,9 +1020,10 @@ int utfc_ptr2len_len(const char *p, int size)
return 1;
}
- // Check for composing characters. We can handle only the first six, but
+ // Check for composing characters. We can only display a limited amount, but
// skip all of them (otherwise the cursor would get stuck).
int prevlen = 0;
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (len < size) {
if ((uint8_t)p[len] < 0x80) {
break;
@@ -1009,7 +1036,7 @@ int utfc_ptr2len_len(const char *p, int size)
break;
}
- if (!utf_composinglike(p + prevlen, p + len)) {
+ if (!utf_composinglike(p + prevlen, p + len, &state)) {
break;
}
@@ -1082,13 +1109,21 @@ int utf_char2bytes(const int c, char *const buf)
}
}
-/// Return true if "c" is a composing UTF-8 character.
-/// This means it will be drawn on top of the preceding character.
+/// Return true if "c" is a legacy composing UTF-8 character.
+///
+/// This is deprecated in favour of utf_composinglike() which uses the modern
+/// stateful algorithm to determine grapheme clusters. Still available
+/// to support some legacy code which hasn't been refactored yet.
+///
+/// To check if a char would combine with a preceding space, use
+/// utf_iscomposing_first() instead.
+///
/// Based on code from Markus Kuhn.
/// Returns false for negative values.
-bool utf_iscomposing(int c)
+bool utf_iscomposing_legacy(int c)
{
- return intable(combining, ARRAY_SIZE(combining), c);
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+ return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
}
#ifdef __SSE2__
@@ -1133,6 +1168,33 @@ bool utf_printable(int c)
#else
+// Return true if "c" is in "table".
+static bool intable(const struct interval *table, size_t n_items, int c)
+ FUNC_ATTR_PURE
+{
+ assert(n_items > 0);
+ // first quick check for Latin1 etc. characters
+ if (c < table[0].first) {
+ return false;
+ }
+
+ assert(n_items <= SIZE_MAX / 2);
+ // binary search in table
+ size_t bot = 0;
+ size_t top = n_items;
+ do {
+ size_t mid = (bot + top) >> 1;
+ if (table[mid].last < c) {
+ bot = mid + 1;
+ } else if (table[mid].first > c) {
+ top = mid;
+ } else {
+ return true;
+ }
+ } while (top > bot);
+ return false;
+}
+
// Return true for characters that can be displayed in a normal way.
// Only for characters of 0x100 and above!
bool utf_printable(int c)
@@ -1255,8 +1317,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 1; // punctuation
}
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
// emoji
- if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ if (prop_is_emojilike(prop)) {
return 3;
}
@@ -1276,47 +1339,51 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 2;
}
-bool utf_ambiguous_width(int c)
+bool utf_ambiguous_width(const char *p)
{
- return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
- || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
-}
+ // be quick if there is nothing to print or ASCII-only
+ if (p[0] == NUL || p[1] == NUL) {
+ return false;
+ }
-// Generic conversion function for case operations.
-// Return the converted equivalent of "a", which is a UCS-4 character. Use
-// the given conversion "table". Uses binary search on "table".
-static int utf_convert(int a, const convertStruct *const table, size_t n_items)
-{
- // indices into table
- size_t start = 0;
- size_t end = n_items;
- while (start < end) {
- // need to search further
- size_t mid = (end + start) / 2;
- if (table[mid].rangeEnd < a) {
- start = mid + 1;
- } else {
- end = mid;
+ CharInfo info = utf_ptr2CharInfo(p);
+ if (info.value >= 0x80) {
+ const utf8proc_property_t *prop = utf8proc_get_property(info.value);
+ if (prop->ambiguous_width || prop_is_emojilike(prop)) {
+ return true;
}
}
- if (start < n_items
- && table[start].rangeStart <= a
- && a <= table[start].rangeEnd
- && (a - table[start].rangeStart) % table[start].step == 0) {
- return a + table[start].offset;
- }
- return a;
+
+ // check if second sequence is 0xFE0F VS-16 which can turn things into emoji,
+ // safe with NUL (no second sequence)
+ return memcmp(p + info.len, "\xef\xb8\x8f", 3) == 0;
}
// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses
-// simple case folding.
+// full case folding.
int utf_fold(int a)
{
if (a < 0x80) {
// be fast for ASCII
return a >= 0x41 && a <= 0x5a ? a + 32 : a;
}
- return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
+
+ // TODO(dundargoc): utf8proc only does full case folding, which breaks some tests. This is a
+ // temporary workaround to circumvent failing tests.
+ //
+ // (0xdf) ß == ss in full casefolding. Using this however breaks the vim spell tests and the error
+ // E763 is thrown. This is due to the test spells relying on the vim spell files.
+ //
+ // (0x130) İ == i̇ in full casefolding.
+ if (a == 0xdf || a == 0x130) {
+ return a;
+ }
+
+ utf8proc_int32_t result[1];
+
+ utf8proc_ssize_t res = utf8proc_decompose_char(a, result, 1, UTF8PROC_CASEFOLD, NULL);
+
+ return (res == 1) ? result[0] : a;
}
// Vim's own character class functions. These exist because many library
@@ -1324,9 +1391,6 @@ int utf_fold(int a)
// invalid values or can't handle latin1 when the locale is C.
// Speed is most important here.
-// Note: UnicodeData.txt does not define U+1E9E as being the corresponding upper
-// case letter for U+00DF (ß), however it is part of the toLower table
-
/// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
/// simple case folding.
int mb_toupper(int a)
@@ -1345,14 +1409,12 @@ int mb_toupper(int a)
return TOUPPER_LOC(a);
}
- // For any other characters use the above mapping table.
- return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
+ return utf8proc_toupper(a);
}
bool mb_islower(int a)
{
- // German sharp s is lower case but has no upper case equivalent.
- return (mb_toupper(a) != a) || a == 0xdf;
+ return mb_toupper(a) != a;
}
/// Return the lower-case equivalent of "a", which is a UCS-4 character. Use
@@ -1373,8 +1435,7 @@ int mb_tolower(int a)
return TOLOWER_LOC(a);
}
- // For any other characters use the above mapping table.
- return utf_convert(a, toLower, ARRAY_SIZE(toLower));
+ return utf8proc_tolower(a);
}
bool mb_isupper(int a)
@@ -1388,7 +1449,7 @@ bool mb_isalpha(int a)
return mb_islower(a) || mb_isupper(a);
}
-static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
+int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
{
int c1, c2;
char buffer[6];
@@ -1545,7 +1606,7 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
return uv_translate_sys_error(GetLastError());
}
- (*utf8)[bufsize] = '\0';
+ (*utf8)[bufsize] = NUL;
return 0;
}
@@ -1673,6 +1734,26 @@ void show_utf8(void)
msg(IObuff, 0);
}
+/// @return true if boundclass bc always starts a new cluster regardless of what's before
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break(int bc)
+{
+ return (bc == UTF8PROC_BOUNDCLASS_CONTROL);
+}
+
+/// @return true if bc2 always starts a cluster after bc1
+/// false negatives are allowed (perf cost, not correctness)
+static bool always_break_two(int bc1, int bc2)
+{
+ // don't check for UTF8PROC_BOUNDCLASS_CONTROL for bc2 as it either has been checked by
+ // "always_break" on first iteration or when it was bc1 in the previous iteration
+ return ((bc1 != UTF8PROC_BOUNDCLASS_PREPEND && bc2 == UTF8PROC_BOUNDCLASS_OTHER)
+ || (bc1 >= UTF8PROC_BOUNDCLASS_CR && bc1 <= UTF8PROC_BOUNDCLASS_CONTROL)
+ || (bc2 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+ && (bc1 == UTF8PROC_BOUNDCLASS_OTHER
+ || bc1 == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC)));
+}
+
/// Return offset from "p" to the start of a character, including composing characters.
/// "base" must be the start of the string, which must be NUL terminated.
/// If "p" points to the NUL at the end of the string return 0.
@@ -1686,50 +1767,111 @@ int utf_head_off(const char *base_in, const char *p_in)
const uint8_t *base = (uint8_t *)base_in;
const uint8_t *p = (uint8_t *)p_in;
- // Skip backwards over trailing bytes: 10xx.xxxx
- // Skip backwards again if on a composing char.
- const uint8_t *q;
- for (q = p;; q--) {
- // Move s to the last byte of this char.
- const uint8_t *s;
- for (s = q; (s[1] & 0xc0) == 0x80; s++) {}
-
- // Move q to the first byte of this char.
- while (q > base && (*q & 0xc0) == 0x80) {
- q--;
- }
- // Check for illegal sequence. Do allow an illegal byte after where we
- // started.
- int len = utf8len_tab[*q];
- if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
- return 0;
+ const uint8_t *start = p;
+
+ // move start to the first byte of this codepoint
+ // might stop on a continuation byte if overlong, handled by utf_ptr2CharInfo_impl
+ while (start > base && (*start & 0xc0) == 0x80 && (p - start) < 6) {
+ start--;
+ }
+
+ const uint8_t last_len = utf8len_tab[*start];
+ int32_t cur_code = utf_ptr2CharInfo_impl(start, (uintptr_t)last_len);
+ if (cur_code < 0 || p - start >= last_len) {
+ return 0; // p must be part of an illegal sequence
+ }
+ const uint8_t * const safe_end = start + last_len;
+
+ int cur_bc = utf8proc_get_property(cur_code)->boundclass;
+ if (always_break(cur_bc) || start == base) {
+ return (int)(p - start);
+ }
+
+ // backtrack to find the start of a cluster. we might go too far, checked in the next loop
+ const uint8_t *cur_pos = start;
+ const uint8_t *const p_start = start;
+
+ while (true) {
+ if (start[-1] == NUL) {
+ break;
}
- if (q <= base) {
+ start--;
+ if (*start < 0x80) { // stop on ascii, we are done
break;
}
- int c = utf_ptr2char((char *)q);
- if (utf_iscomposing(c)) {
- continue;
+ while (start > base && (*start & 0xc0) == 0x80 && (cur_pos - start) < 6) {
+ start--;
}
- if (arabic_maycombine(c)) {
- // Advance to get a sneak-peak at the next char
- const uint8_t *j = q;
- j--;
- // Move j to the first byte of this char.
- while (j > base && (*j & 0xc0) == 0x80) {
- j--;
- }
- if (arabic_combine(utf_ptr2char((char *)j), c)) {
- continue;
- }
+ int prev_len = utf8len_tab[*start];
+ int32_t prev_code = utf_ptr2CharInfo_impl(start, (uintptr_t)prev_len);
+ if (prev_code < 0 || prev_len < cur_pos - start) {
+ start = cur_pos; // start at valid sequence after invalid bytes
+ break;
}
- break;
+
+ int prev_bc = utf8proc_get_property(prev_code)->boundclass;
+ if (always_break_two(prev_bc, cur_bc) && !arabic_combine(prev_code, cur_code)) {
+ start = cur_pos; // prev_code cannot be a part of this cluster
+ break;
+ } else if (start == base) {
+ break;
+ }
+ cur_pos = start;
+ cur_bc = prev_bc;
+ cur_code = prev_code;
+ }
+
+ // hot path: we are already on the first codepoint of a sequence
+ if (start == p_start && last_len > p - start) {
+ return (int)(p - start);
}
- return (int)(p - q);
+ const uint8_t *q = start;
+ while (q < p) {
+ // don't need to find end of cluster. once we reached the codepoint of p, we are done
+ int len = utfc_ptr2len_len((const char *)q, (int)(safe_end - q));
+
+ if (q + len > p) {
+ return (int)(p - q);
+ }
+
+ q += len;
+ }
+
+ return 0;
+}
+
+/// Assumes the caller already handles ASCII. See `utfc_next`.
+StrCharInfo utfc_next_impl(StrCharInfo cur)
+{
+ int32_t prev_code = cur.chr.value;
+ uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
+ GraphemeState state = GRAPHEME_STATE_INIT;
+ assert(*next >= 0x80);
+
+ while (true) {
+ uint8_t const next_len = utf8len_tab[*next];
+ int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
+ if (utf8proc_grapheme_break_stateful(prev_code, next_code, &state)
+ && !arabic_combine(prev_code, next_code)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
+ };
+ }
+
+ prev_code = next_code;
+ next += next_len;
+ if (EXPECT(*next < 0x80U, true)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = *next, .len = 1 },
+ };
+ }
+ }
}
// Whether space is NOT allowed before/after 'c'.
@@ -2688,7 +2830,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
c = 0x100; break; // not in latin9
}
}
- if (!utf_iscomposing(c)) { // skip composing chars
+ if (!utf_iscomposing_legacy(c)) { // skip composing chars
if (c < 0x100) {
*d++ = (uint8_t)c;
} else if (vcp->vc_fail) {
@@ -2776,17 +2918,17 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
emsg(_(e_listreq));
return;
}
+
const list_T *const l = argvars[0].vval.v_list;
- if (tv_list_len(l) == 0) {
+ cw_interval_T *table = NULL;
+ const size_t table_size = (size_t)tv_list_len(l);
+ if (table_size == 0) {
// Clearing the table.
- xfree(cw_table);
- cw_table = NULL;
- cw_table_size = 0;
- return;
+ goto update;
}
// Note: use list_T instead of listitem_T so that TV_LIST_ITEM_NEXT can be used properly below.
- const list_T **ptrs = xmalloc(sizeof(const list_T *) * (size_t)tv_list_len(l));
+ const list_T **ptrs = xmalloc(sizeof(const list_T *) * table_size);
// Check that all entries are a list with three numbers, the range is
// valid and the cell width is valid.
@@ -2838,12 +2980,12 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
});
// Sort the list on the first number.
- qsort((void *)ptrs, (size_t)tv_list_len(l), sizeof(const list_T *), tv_nr_compare);
+ qsort((void *)ptrs, table_size, sizeof(const list_T *), tv_nr_compare);
- cw_interval_T *table = xmalloc(sizeof(cw_interval_T) * (size_t)tv_list_len(l));
+ table = xmalloc(sizeof(cw_interval_T) * table_size);
// Store the items in the new table.
- for (item = 0; item < tv_list_len(l); item++) {
+ for (item = 0; (size_t)item < table_size; item++) {
const list_T *const li_l = ptrs[item];
const listitem_T *lili = tv_list_first(li_l);
const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number;
@@ -2862,10 +3004,12 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
xfree((void *)ptrs);
+update:
+ ;
cw_interval_T *const cw_table_save = cw_table;
const size_t cw_table_size_save = cw_table_size;
cw_table = table;
- cw_table_size = (size_t)tv_list_len(l);
+ cw_table_size = table_size;
// Check that the new value does not conflict with 'listchars' or
// 'fillchars'.