aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/regexp.c')
-rw-r--r--src/nvim/regexp.c88
1 files changed, 58 insertions, 30 deletions
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 31384e8124..3f34ca7e0e 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -1732,7 +1732,9 @@ static void mb_decompose(int c, int *c1, int *c2, int *c3)
/// Compare two strings, ignore case if rex.reg_ic set.
/// Return 0 if strings match, non-zero otherwise.
-/// Correct the length "*n" when composing characters are ignored.
+/// Correct the length "*n" when composing characters are ignored
+/// or when both utf codepoints are considered equal because of
+/// case-folding but have different length (e.g. 's' and 'ſ')
static int cstrncmp(char *s1, char *s2, int *n)
{
int result;
@@ -1740,8 +1742,27 @@ static int cstrncmp(char *s1, char *s2, int *n)
if (!rex.reg_ic) {
result = strncmp(s1, s2, (size_t)(*n));
} else {
- assert(*n >= 0);
- result = mb_strnicmp(s1, s2, (size_t)(*n));
+ char *p = s1;
+ size_t n2 = 0;
+ int n1 = *n;
+ // count the number of characters for byte-length of s1
+ while (n1 > 0 && *p != NUL) {
+ n1 -= utfc_ptr2len(s1);
+ MB_PTR_ADV(p);
+ n2++;
+ }
+ // count the number of bytes to advance the same number of chars for s2
+ p = s2;
+ while (n2-- > 0 && *p != NUL) {
+ MB_PTR_ADV(p);
+ }
+
+ n2 = (size_t)(p - s2);
+
+ result = utf_strnicmp(s1, s2, (size_t)(*n), n2);
+ if (result == 0 && (int)n2 < *n) {
+ *n = (int)n2;
+ }
}
// if it failed and it's utf8 and we want to combineignore:
@@ -1799,31 +1820,34 @@ static inline char *cstrchr(const char *const s, const int c)
return vim_strchr(s, c);
}
- // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is
- // expected to be highly optimized.
+ int cc, lc;
if (c > 0x80) {
- const int folded_c = utf_fold(c);
- for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) {
- const int uc = utf_ptr2char(p);
- // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf, not 0xff.
- if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == folded_c) {
- return (char *)p;
- }
- }
- return NULL;
- }
-
- int cc;
- if (ASCII_ISUPPER(c)) {
+ cc = utf_fold(c);
+ lc = cc;
+ } else if (ASCII_ISUPPER(c)) {
cc = TOLOWER_ASC(c);
+ lc = cc;
} else if (ASCII_ISLOWER(c)) {
cc = TOUPPER_ASC(c);
+ lc = c;
} else {
return vim_strchr(s, c);
}
- char tofind[] = { (char)c, (char)cc, NUL };
- return strpbrk(s, tofind);
+ for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) {
+ const int uc = utf_ptr2char(p);
+ if (c > 0x80 || uc > 0x80) {
+ // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf, not 0xff.
+ // Compare with lower case of the character.
+ if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == lc) {
+ return (char *)p;
+ }
+ } else if ((uint8_t)(*p) == c || (uint8_t)(*p) == cc) {
+ return (char *)p;
+ }
+ }
+
+ return NULL;
}
////////////////////////////////////////////////////////////////
@@ -6619,11 +6643,9 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
}
}
} else {
- for (i = 0; i < len; i++) {
- if (opnd[i] != rex.input[i]) {
- status = RA_NOMATCH;
- break;
- }
+ if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) {
+ status = RA_NOMATCH;
+ break;
}
}
rex.input += len;
@@ -13984,19 +14006,25 @@ static int skip_to_start(int c, colnr_T *colp)
static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
{
colnr_T col = *startcol;
- const int regstart_len = utf_ptr2len((char *)rex.line + col);
+ const int regstart_len = utf_char2len(regstart);
while (true) {
bool match = true;
uint8_t *s1 = match_text;
- uint8_t *s2 = rex.line + col + regstart_len; // skip regstart
+ // skip regstart
+ int regstart_len2 = regstart_len;
+ if (regstart_len2 > 1 && utf_ptr2len((char *)rex.line + col) != regstart_len2) {
+ // because of case-folding of the previously matched text, we may need
+ // to skip fewer bytes than utf_char2len(regstart)
+ regstart_len2 = utf_char2len(utf_fold(regstart));
+ }
+ uint8_t *s2 = rex.line + col + regstart_len2;
while (*s1) {
int c1_len = utf_ptr2len((char *)s1);
int c1 = utf_ptr2char((char *)s1);
int c2_len = utf_ptr2len((char *)s2);
int c2 = utf_ptr2char((char *)s2);
- if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
- || c1_len != c2_len) {
+ if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {
match = false;
break;
}
@@ -15664,7 +15692,7 @@ static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int
// If match_text is set it contains the full text that must match.
// Nothing else to try. Doesn't handle combining chars well.
- if (prog->match_text != NULL && !rex.reg_icombine) {
+ if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) {
retval = find_match_text(&col, prog->regstart, prog->match_text);
if (REG_MULTI) {
rex.reg_mmatch->rmm_matchcol = col;