aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.h
diff options
context:
space:
mode:
authorbfredl <bjorn.linse@gmail.com>2024-08-08 10:42:08 +0200
committerbfredl <bjorn.linse@gmail.com>2024-08-30 11:49:09 +0200
commitcfdf68a7acde16597fbd896674af68c42361102c (patch)
tree6113193fda7a7c0f94577a464e39964e74311583 /src/nvim/mbyte.h
parent4353996d0fa8e5872a334d68196d8088391960cf (diff)
downloadrneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip
feat(mbyte): support extended grapheme clusters including more emoji
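Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent Unicode versions. Handle the variation selector VS16 (U+FE0F), which turns some codepoints into double-width emoji. Because the display width of a character can now depend on the codepoints that follow it, we need to use ptr2cells (which sees the whole cluster) rather than char2cells (which sees a single codepoint) whenever possible.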
Diffstat (limited to 'src/nvim/mbyte.h')
-rw-r--r--src/nvim/mbyte.h32
1 files changed, 12 insertions, 20 deletions
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 6cbfbcbc3c..2da051fca2 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -3,6 +3,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h> // IWYU pragma: keep
+#include <utf8proc.h>
#include <uv.h> // IWYU pragma: keep
#include "nvim/cmdexpand_defs.h" // IWYU pragma: keep
@@ -11,6 +12,9 @@
#include "nvim/mbyte_defs.h" // IWYU pragma: keep
#include "nvim/types_defs.h" // IWYU pragma: keep
+typedef utf8proc_int32_t GraphemeState;
+#define GRAPHEME_STATE_INIT 0
+
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.h.generated.h"
# include "mbyte.h.inline.generated.h"
@@ -92,28 +96,16 @@ static inline CharInfo utf_ptr2CharInfo(char const *const p_in)
static inline StrCharInfo utfc_next(StrCharInfo cur)
FUNC_ATTR_NONNULL_ALL FUNC_ATTR_ALWAYS_INLINE FUNC_ATTR_PURE
{
- int32_t prev_code = cur.chr.value;
+ // handle ASCII case inline
uint8_t *next = (uint8_t *)(cur.ptr + cur.chr.len);
-
- while (true) {
- if (EXPECT(*next < 0x80U, true)) {
- return (StrCharInfo){
- .ptr = (char *)next,
- .chr = (CharInfo){ .value = *next, .len = 1 },
- };
- }
- uint8_t const next_len = utf8len_tab[*next];
- int32_t const next_code = utf_ptr2CharInfo_impl(next, (uintptr_t)next_len);
- if (!utf_char_composinglike(prev_code, next_code)) {
- return (StrCharInfo){
- .ptr = (char *)next,
- .chr = (CharInfo){ .value = next_code, .len = (next_code < 0 ? 1 : next_len) },
- };
- }
-
- prev_code = next_code;
- next += next_len;
+ if (EXPECT(*next < 0x80U, true)) {
+ return (StrCharInfo){
+ .ptr = (char *)next,
+ .chr = (CharInfo){ .value = *next, .len = 1 },
+ };
}
+
+ return utfc_next_impl(cur);
}
static inline StrCharInfo utf_ptr2StrCharInfo(char *ptr)