aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/regexp.c
diff options
context:
space:
mode:
authorbfredl <bjorn.linse@gmail.com>2024-08-08 10:42:08 +0200
committerbfredl <bjorn.linse@gmail.com>2024-08-30 11:49:09 +0200
commitcfdf68a7acde16597fbd896674af68c42361102c (patch)
tree6113193fda7a7c0f94577a464e39964e74311583 /src/nvim/regexp.c
parent4353996d0fa8e5872a334d68196d8088391960cf (diff)
downloadrneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.gz
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.tar.bz2
rneovim-cfdf68a7acde16597fbd896674af68c42361102c.zip
feat(mbyte): support extended grapheme clusters including more emoji
Use the grapheme break algorithm from utf8proc to support grapheme clusters from recent unicode versions. Handle variant selector VS16 turning some codepoints into double-width emoji. This means we need to use ptr2cells rather than char2cells when possible.
Diffstat (limited to 'src/nvim/regexp.c')
-rw-r--r--src/nvim/regexp.c29
1 files changed, 15 insertions, 14 deletions
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 7dbbb19545..c91c112c3c 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -3031,7 +3031,7 @@ static bool use_multibytecode(int c)
{
return utf_char2len(c) > 1
&& (re_multi_type(peekchr()) != NOT_MULTI
- || utf_iscomposing(c));
+ || utf_iscomposing_legacy(c));
}
// Emit (if appropriate) a byte of code
@@ -4326,7 +4326,7 @@ static uint8_t *regatom(int *flagp)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
- if (c == Magic('.') && utf_iscomposing(peekchr())) {
+ if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
c = getchr();
goto do_multibyte;
}
@@ -5001,9 +5001,10 @@ do_multibyte:
int l;
// Need to get composing character too.
+ GraphemeState state = GRAPHEME_STATE_INIT;
while (true) {
l = utf_ptr2len(regparse);
- if (!utf_composinglike(regparse, regparse + l)) {
+ if (!utf_composinglike(regparse, regparse + l, &state)) {
break;
}
regmbc(utf_ptr2char(regparse));
@@ -6569,7 +6570,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
// Check for following composing character, unless %C
// follows (skips over all composing chars).
if (status != RA_NOMATCH
- && utf_composinglike((char *)rex.input, (char *)rex.input + len)
+ && utf_composinglike((char *)rex.input, (char *)rex.input + len, NULL)
&& !rex.reg_icombine
&& OP(next) != RE_COMPOSING) {
// raaron: This code makes a composing character get
@@ -6624,14 +6625,14 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
break;
}
const int opndc = utf_ptr2char((char *)opnd);
- if (utf_iscomposing(opndc)) {
+ if (utf_iscomposing_legacy(opndc)) {
// When only a composing char is given match at any
// position where that composing char appears.
status = RA_NOMATCH;
for (i = 0; rex.input[i] != NUL;
i += utf_ptr2len((char *)rex.input + i)) {
const int inpc = utf_ptr2char((char *)rex.input + i);
- if (!utf_iscomposing(inpc)) {
+ if (!utf_iscomposing_legacy(inpc)) {
if (i > 0) {
break;
}
@@ -6654,7 +6655,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
case RE_COMPOSING:
// Skip composing characters.
- while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
+ while (utf_iscomposing_legacy(utf_ptr2char((char *)rex.input))) {
rex.input += utf_ptr2len((char *)rex.input);
}
break;
@@ -10070,7 +10071,7 @@ static int nfa_regatom(void)
}
// When '.' is followed by a composing char ignore the dot, so that
// the composing char is matched here.
- if (c == Magic('.') && utf_iscomposing(peekchr())) {
+ if (c == Magic('.') && utf_iscomposing_legacy(peekchr())) {
old_regparse = (uint8_t *)regparse;
c = getchr();
goto nfa_do_multibyte;
@@ -10705,7 +10706,7 @@ collection:
nfa_do_multibyte:
// plen is length of current char with composing chars
if (utf_char2len(c) != (plen = utfc_ptr2len((char *)old_regparse))
- || utf_iscomposing(c)) {
+ || utf_iscomposing_legacy(c)) {
int i = 0;
// A base character plus composing characters, or just one
@@ -14033,7 +14034,7 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
}
if (match
// check that no composing char follows
- && !utf_iscomposing(utf_ptr2char((char *)s2))) {
+ && !utf_iscomposing_legacy(utf_ptr2char((char *)s2))) {
cleanup_subexpr();
if (REG_MULTI) {
rex.reg_startpos[0].lnum = rex.lnum;
@@ -14278,7 +14279,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
// is not really a match.
if (!rex.reg_icombine
&& rex.input != rex.line
- && utf_iscomposing(curc)) {
+ && utf_iscomposing_legacy(curc)) {
break;
}
nfa_match = true;
@@ -14622,7 +14623,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
sta = t->state->out;
len = 0;
- if (utf_iscomposing(sta->c)) {
+ if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@@ -14724,7 +14725,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
int j;
sta = t->state->out->out;
- if (utf_iscomposing(sta->c)) {
+ if (utf_iscomposing_legacy(sta->c)) {
// Only match composing character(s), ignore base
// character. Used for ".{composing}" and "{composing}"
// (no preceding character).
@@ -14846,7 +14847,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
case NFA_ANY_COMPOSING:
// On a composing character skip over it. Otherwise do
// nothing. Always matches.
- if (utf_iscomposing(curc)) {
+ if (utf_iscomposing_legacy(curc)) {
add_off = clen;
} else {
add_here = true;