diff options
-rw-r--r-- | runtime/doc/dev_vimpatch.txt | 2 | ||||
-rw-r--r-- | src/nvim/mbyte.c | 2 | ||||
-rw-r--r-- | src/nvim/regexp.c | 88 | ||||
-rw-r--r-- | test/old/testdir/test_regexp_utf8.vim | 32 |
4 files changed, 93 insertions, 31 deletions
diff --git a/runtime/doc/dev_vimpatch.txt b/runtime/doc/dev_vimpatch.txt index 6d450424c5..98a4246057 100644 --- a/runtime/doc/dev_vimpatch.txt +++ b/runtime/doc/dev_vimpatch.txt @@ -204,6 +204,8 @@ information. mb_ptr2char utf_ptr2char mb_head_off utf_head_off mb_tail_off utf_cp_bounds + mb_strnicmp2 utf_strnicmp + MB_STRNICMP2 utf_strnicmp mb_lefthalve grid_lefthalve mb_fix_col grid_fix_col utf_off2cells grid_off2cells diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index a544113a7b..c6cefb8a91 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1384,7 +1384,7 @@ bool mb_isalpha(int a) return mb_islower(a) || mb_isupper(a); } -static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2) +int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2) { int c1, c2; char buffer[6]; diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 31384e8124..3f34ca7e0e 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -1732,7 +1732,9 @@ static void mb_decompose(int c, int *c1, int *c2, int *c3) /// Compare two strings, ignore case if rex.reg_ic set. /// Return 0 if strings match, non-zero otherwise. -/// Correct the length "*n" when composing characters are ignored. +/// Correct the length "*n" when composing characters are ignored +/// or when both utf codepoints are considered equal because of +/// case-folding but have different length (e.g. 
's' and 'ſ') static int cstrncmp(char *s1, char *s2, int *n) { int result; @@ -1740,8 +1742,27 @@ static int cstrncmp(char *s1, char *s2, int *n) if (!rex.reg_ic) { result = strncmp(s1, s2, (size_t)(*n)); } else { - assert(*n >= 0); - result = mb_strnicmp(s1, s2, (size_t)(*n)); + char *p = s1; + size_t n2 = 0; + int n1 = *n; + // count the number of characters in the first "*n" bytes of s1; measure the character at "p", not always the first one + while (n1 > 0 && *p != NUL) { + n1 -= utfc_ptr2len(p); + MB_PTR_ADV(p); + n2++; + } + // count the number of bytes to advance the same number of chars for s2 + p = s2; + while (n2-- > 0 && *p != NUL) { + MB_PTR_ADV(p); + } + + n2 = (size_t)(p - s2); + + result = utf_strnicmp(s1, s2, (size_t)(*n), n2); + if (result == 0 && (int)n2 < *n) { + *n = (int)n2; + } } // if it failed and it's utf8 and we want to combineignore: @@ -1799,31 +1820,34 @@ static inline char *cstrchr(const char *const s, const int c) return vim_strchr(s, c); } - // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is - // expected to be highly optimized. + int cc, lc; if (c > 0x80) { - const int folded_c = utf_fold(c); - for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) { - const int uc = utf_ptr2char(p); - // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf, not 0xff. - if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == folded_c) { - return (char *)p; - } - } - return NULL; - } - - int cc; - if (ASCII_ISUPPER(c)) { + cc = utf_fold(c); + lc = cc; + } else if (ASCII_ISUPPER(c)) { cc = TOLOWER_ASC(c); + lc = cc; } else if (ASCII_ISLOWER(c)) { cc = TOUPPER_ASC(c); + lc = c; } else { return vim_strchr(s, c); } - char tofind[] = { (char)c, (char)cc, NUL }; - return strpbrk(s, tofind); + for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) { + const int uc = utf_ptr2char(p); + if (c > 0x80 || uc > 0x80) { + // Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf, not 0xff. + // Compare with lower case of the character. 
+ if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == lc) { + return (char *)p; + } + } else if ((uint8_t)(*p) == c || (uint8_t)(*p) == cc) { + return (char *)p; + } + } + + return NULL; } //////////////////////////////////////////////////////////////// @@ -6619,11 +6643,9 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) } } } else { - for (i = 0; i < len; i++) { - if (opnd[i] != rex.input[i]) { - status = RA_NOMATCH; - break; - } + if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) { + status = RA_NOMATCH; + break; } } rex.input += len; @@ -13984,19 +14006,25 @@ static int skip_to_start(int c, colnr_T *colp) static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) { colnr_T col = *startcol; - const int regstart_len = utf_ptr2len((char *)rex.line + col); + const int regstart_len = utf_char2len(regstart); while (true) { bool match = true; uint8_t *s1 = match_text; - uint8_t *s2 = rex.line + col + regstart_len; // skip regstart + // skip regstart + int regstart_len2 = regstart_len; + if (regstart_len2 > 1 && utf_ptr2len((char *)rex.line + col) != regstart_len2) { + // because of case-folding of the previously matched text, we may need + // to skip fewer bytes than utf_char2len(regstart) + regstart_len2 = utf_char2len(utf_fold(regstart)); + } + uint8_t *s2 = rex.line + col + regstart_len2; while (*s1) { int c1_len = utf_ptr2len((char *)s1); int c1 = utf_ptr2char((char *)s1); int c2_len = utf_ptr2len((char *)s2); int c2 = utf_ptr2char((char *)s2); - if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) - || c1_len != c2_len) { + if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { match = false; break; } @@ -15664,7 +15692,7 @@ static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int // If match_text is set it contains the full text that must match. // Nothing else to try. Doesn't handle combining chars well. 
- if (prog->match_text != NULL && !rex.reg_icombine) { + if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) { retval = find_match_text(&col, prog->regstart, prog->match_text); if (REG_MULTI) { rex.reg_mmatch->rmm_matchcol = col; diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim index 07695f9cc8..505e99919c 100644 --- a/test/old/testdir/test_regexp_utf8.vim +++ b/test/old/testdir/test_regexp_utf8.vim @@ -594,4 +594,36 @@ func Test_combining_chars_in_collection() bw! endfunc +func Test_search_multibyte_match_ascii() + new + " Match single 'ſ' and 's' + call setline(1, 'das abc heraus abc ſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re) + endfor + " Match several 'ſſ' and 'ss' + call setline(1, 'das abc herauss abc ſſich abc ſind') + for i in range(0, 2) + exe "set re="..i + let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text}) + let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text}) + let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text}) + let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text}) + + call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. 
&re) + call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re) + call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re) + call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re) + call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re) + endfor + bw! +endfunc + " vim: shiftwidth=2 sts=2 expandtab |