aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
authorbfredl <bjorn.linse@gmail.com>2024-08-14 10:10:54 +0200
committerbfredl <bjorn.linse@gmail.com>2024-08-31 18:09:38 +0200
commit26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9 (patch)
tree117020f129a2d7bbb2307d32c23aa868a7933bb6 /src/nvim/mbyte.c
parenta6c4487e8bfc8dc527ed64651515963e46ebeee8 (diff)
downloadrneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.gz
rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.tar.bz2
rneovim-26be6446e5ea1c5b22c50bfd9a0e5aa85927aff9.zip
refactor(multibyte): replace generated unicode tables with utf8proc
This commit intentionally aims at preserving existing behavior as much as possible while replacing our build step to convert unicode data files into binary tables, with corresponding lookups in utf8proc. Actual improvements in behavior will be a followup. The only change in behavior is that the 'emoji' option will turn some more codepoints into double width. Nvim used the "Emoji" and "Emoji_Presentation" properties to define emojis, while utf8proc only exposes the Extended_Pictographic property from the emoji table. This is a superset of the previous emoji properties. As only codepoints above 0x1f000 are affected by the 'emoji' option, this means that the following chars are now treated as double-width, instead of single-width like in previous nvim versions: 🀀 🀁 🀂 🀃 🀅 🀆 🀇 🀈 🀉 🀊 🀋 🀌 🀍 🀎 🀏 🀐 🀑 🀒 🀓 🀔 🀕 🀖 🀗 🀘 🀙 🀚 🀛 🀜 🀝 🀞 🀟 🀠 🀡 🀢 🀣 🀤 🀥 🀦 🀧 🀨 🀩 🀪 🀫 🀰 🀱 🀲 🀳 🀴 🀵 🀶 🀷 🀸 🀹 🀺 🀻 🀼 🀽 🀾 🀿 🁀 🁁 🁂 🁃 🁄 🁅 🁆 🁇 🁈 🁉 🁊 🁋 🁌 🁍 🁎 🁏 🁐 🁑 🁒 🁓 🁔 🁕 🁖 🁗 🁘 🁙 🁚 🁛 🁜 🁝 🁞 🁟 🁠 🁡 🁢 🁣 🁤 🁥 🁦 🁧 🁨 🁩 🁪 🁫 🁬 🁭 🁮 🁯 🁰 🁱 🁲 🁳 🁴 🁵 🁶 🁷 🁸 🁹 🁺 🁻 🁼 🁽 🁾 🁿 🂀 🂁 🂂 🂃 🂄 🂅 🂆 🂇 🂈 🂉 🂊 🂋 🂌 🂍 🂎 🂏 🂐 🂑 🂒 🂓 🂠 🂡 🂢 🂣 🂤 🂥 🂦 🂧 🂨 🂩 🂪 🂫 🂬 🂭 🂮 🂱 🂲 🂳 🂴 🂵 🂶 🂷 🂸 🂹 🂺 🂻 🂼 🂽 🂾 🂿 🃁 🃂 🃃 🃄 🃅 🃆 🃇 🃈 🃉 🃊 🃋 🃌 🃍 🃎 🃑 🃒 🃓 🃔 🃕 🃖 🃗 🃘 🃙 🃚 🃛 🃜 🃝 🃞 🃟 🃠 🃡 🃢 🃣 🃤 🃥 🃦 🃧 🃨 🃩 🃪 🃫 🃬 🃭 🃮 🃯 🃰 🃱 🃲 🃳 🃴 🃵 🄍 🄎 🄏 🄯 🅬 🅭 🅮 🅯 🆭 🌢 🌣 🎔 🎕 🎘 🎜 🎝 🏱 🏲 🏶 📾 🕆 🕇 🕈
🕏 🕨 🕩 🕪 🕫 🕬 🕭 🕮 🕱 🕲 🕻 🕼 🕽 🕾 🕿 🖀 🖁 🖂 🖃 🖄 🖅 🖆 🖈 🖉 🖎 🖏 🖑 🖒 🖓 🖔 🖗 🖘 🖙 🖚 🖛 🖜 🖝 🖞 🖟 🖠 🖡 🖢 🖣 🖦 🖧 🖩 🖪 🖫 🖬 🖭 🖮 🖯 🖰 🖳 🖴 🖵 🖶 🖷 🖸 🖹 🖺 🖻 🖽 🖾 🖿 🗀 🗁 🗅 🗆 🗇 🗈 🗉 🗊 🗋 🗌 🗍 🗎 🗏 🗐 🗔 🗕 🗖 🗗 🗘 🗙 🗚 🗛 🗟 🗠 🗢 🗤 🗥 🗦 🗧 🗩 🗪 🗫 🗬 🗭 🗮 🗰 🗱 🗲 🗴 🗵 🗶 🗷 🗸 🗹 🛆 🛇 🛈 🛉 🛊 🛓 🛔 🛦 🛧 🛨 🛪 🛱 🛲 🝴 🝵 🝶 🝻 🝼 🝽 🝾 🝿 🟕 🟖 🟗 🟘 🟙 🢰 🢱 🨀 🨁 🨂 🨃 🨄 🨅 🨆 🨇 🨈 🨉 🨊 🨋 🨌 🨍 🨎 🨏 🨐 🨑 🨒 🨓 🨔 🨕 🨖 🨗 🨘 🨙 🨚 🨛 🨜 🨝 🨞 🨟 🨠 🨡 🨢 🨣 🨤 🨥 🨦 🨧 🨨 🨩 🨪 🨫 🨬 🨭 🨮 🨯 🨰 🨱 🨲 🨳 🨴 🨵 🨶 🨷 🨸 🨹 🨺 🨻 🨼 🨽 🨾 🨿 🩀 🩁 🩂 🩃 🩄 🩅 🩆 🩇 🩈 🩉 🩊 🩋 🩌 🩍 🩎 🩏 🩐 🩑 🩒 🩓 🩠 🩡 🩢 🩣 🩤 🩥 🩦 🩧 🩨 🩩 🩪 🩫 🩬 🩭
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c84
1 files changed, 50 insertions, 34 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 666a904fc5..db4730408b 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -85,7 +85,6 @@ struct interval {
// uncrustify:off
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.c.generated.h"
-# include "unicode_tables.generated.h"
#endif
// uncrustify:on
@@ -444,31 +443,10 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
return utf_class_tab(utf_ptr2char(p), chartab);
}
-// Return true if "c" is in "table".
-static bool intable(const struct interval *table, size_t n_items, int c)
- FUNC_ATTR_PURE
+static bool prop_is_emojilike(const utf8proc_property_t *prop)
{
- assert(n_items > 0);
- // first quick check for Latin1 etc. characters
- if (c < table[0].first) {
- return false;
- }
-
- assert(n_items <= SIZE_MAX / 2);
- // binary search in table
- size_t bot = 0;
- size_t top = n_items;
- do {
- size_t mid = (bot + top) >> 1;
- if (table[mid].last < c) {
- bot = mid + 1;
- } else if (table[mid].first > c) {
- top = mid;
- } else {
- return true;
- }
- } while (top > bot);
- return false;
+ return prop->boundclass == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC
+ || prop->boundclass == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR;
}
/// For UTF-8 character "c" return 2 for a double-width character, 1 for others.
@@ -496,13 +474,18 @@ int utf_char2cells(int c)
return n;
}
- if (intable(doublewidth, ARRAY_SIZE(doublewidth), c)) {
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+
+ if (prop->charwidth == 2) {
return 2;
}
- if (p_emoji && intable(emoji_wide, ARRAY_SIZE(emoji_wide), c)) {
+ if (*p_ambw == 'd' && prop->ambiguous_width) {
return 2;
}
- if (*p_ambw == 'd' && intable(ambiguous, ARRAY_SIZE(ambiguous), c)) {
+
+ // Characters below 1F000 may be considered single width traditionally,
+ // making them double width causes problems.
+ if (p_emoji && c >= 0x1f000 && !prop->ambiguous_width && prop_is_emojilike(prop)) {
return 2;
}
@@ -528,7 +511,7 @@ int utf_ptr2cells(const char *p_in)
}
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji
- && intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ && prop_is_emojilike(utf8proc_get_property(c))) {
int c2 = utf_ptr2char(p_in + len);
if (c2 == 0xFE0F) {
return 2; // emoji presentation
@@ -628,7 +611,7 @@ int utf_ptr2cells_len(const char *p, int size)
}
int cells = utf_char2cells(c);
if (cells == 1 && p_emoji && size > len
- && intable(emoji_all, ARRAY_SIZE(emoji_all), c)
+ && prop_is_emojilike(utf8proc_get_property(c))
&& utf_ptr2len_len(p + len, size - len) == utf8len_tab[(uint8_t)p[len]]) {
int c2 = utf_ptr2char(p + len);
if (c2 == 0xFE0F) {
@@ -1137,7 +1120,8 @@ int utf_char2bytes(const int c, char *const buf)
/// Returns false for negative values.
bool utf_iscomposing_legacy(int c)
{
- return intable(combining, ARRAY_SIZE(combining), c);
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+ return prop->category == UTF8PROC_CATEGORY_MN || prop->category == UTF8PROC_CATEGORY_ME;
}
#ifdef __SSE2__
@@ -1182,6 +1166,33 @@ bool utf_printable(int c)
#else
+// Return true if "c" is in "table".
+static bool intable(const struct interval *table, size_t n_items, int c)
+ FUNC_ATTR_PURE
+{
+ assert(n_items > 0);
+ // first quick check for Latin1 etc. characters
+ if (c < table[0].first) {
+ return false;
+ }
+
+ assert(n_items <= SIZE_MAX / 2);
+ // binary search in table
+ size_t bot = 0;
+ size_t top = n_items;
+ do {
+ size_t mid = (bot + top) >> 1;
+ if (table[mid].last < c) {
+ bot = mid + 1;
+ } else if (table[mid].first > c) {
+ top = mid;
+ } else {
+ return true;
+ }
+ } while (top > bot);
+ return false;
+}
+
// Return true for characters that can be displayed in a normal way.
// Only for characters of 0x100 and above!
bool utf_printable(int c)
@@ -1304,8 +1315,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 1; // punctuation
}
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
// emoji
- if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ if (prop_is_emojilike(prop)) {
return 3;
}
@@ -1328,8 +1340,12 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
bool utf_ambiguous_width(const char *p)
{
int c = utf_ptr2char(p);
- return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
- || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
+ if (c < 0x80) {
+ return false;
+ }
+
+ const utf8proc_property_t *prop = utf8proc_get_property(c);
+ return prop->ambiguous_width || prop_is_emojilike(prop);
}
// Return the folded-case equivalent of "a", which is a UCS-4 character. Uses