4 files changed, 93 insertions, 31 deletions
diff --git a/runtime/doc/dev_vimpatch.txt b/runtime/doc/dev_vimpatch.txt
index 6d450424c5..98a4246057 100644
--- a/runtime/doc/dev_vimpatch.txt
+++ b/runtime/doc/dev_vimpatch.txt
@@ -204,6 +204,8 @@ information.
   mb_ptr2char                                          utf_ptr2char
   mb_head_off                                          utf_head_off
   mb_tail_off                                          utf_cp_bounds
+  mb_strnicmp2                                         utf_strnicmp
+  MB_STRNICMP2                                         utf_strnicmp
   mb_lefthalve                                        grid_lefthalve
   mb_fix_col                                           grid_fix_col
   utf_off2cells                                       grid_off2cells
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index a544113a7b..c6cefb8a91 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1384,7 +1384,7 @@ bool mb_isalpha(int a)
   return mb_islower(a) || mb_isupper(a);
 }
 
-static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
+int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
 {
   int c1, c2;
   char buffer[6];
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 31384e8124..3f34ca7e0e 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -1732,7 +1732,9 @@ static void mb_decompose(int c, int *c1, int *c2, int *c3)
 
 /// Compare two strings, ignore case if rex.reg_ic set.
 /// Return 0 if strings match, non-zero otherwise.
-/// Correct the length "*n" when composing characters are ignored.
+/// Correct the length "*n" when composing characters are ignored
+/// or when both utf codepoints are considered equal because of
+/// case-folding but have different length (e.g. 's' and 'ſ')
 static int cstrncmp(char *s1, char *s2, int *n)
 {
   int result;
@@ -1740,8 +1742,27 @@ static int cstrncmp(char *s1, char *s2, int *n)
   if (!rex.reg_ic) {
     result = strncmp(s1, s2, (size_t)(*n));
   } else {
-    assert(*n >= 0);
-    result = mb_strnicmp(s1, s2, (size_t)(*n));
+    char *p = s1;
+    size_t n2 = 0;
+    int n1 = *n;
+    // count the number of characters for byte-length of s1
+    while (n1 > 0 && *p != NUL) {
+      n1 -= utfc_ptr2len(s1);
+      MB_PTR_ADV(p);
+      n2++;
+    }
+    // count the number of bytes to advance the same number of chars for s2
+    p = s2;
+    while (n2-- > 0 && *p != NUL) {
+      MB_PTR_ADV(p);
+    }
+
+    n2 = (size_t)(p - s2);
+
+    result = utf_strnicmp(s1, s2, (size_t)(*n), n2);
+    if (result == 0 && (int)n2 < *n) {
+      *n = (int)n2;
+    }
   }
 
   // if it failed and it's utf8 and we want to combineignore:
@@ -1799,31 +1820,34 @@ static inline char *cstrchr(const char *const s, const int c)
     return vim_strchr(s, c);
   }
 
-  // Use folded case for UTF-8, slow! For ASCII use libc strpbrk which is
-  // expected to be highly optimized.
+  int cc, lc;
   if (c > 0x80) {
-    const int folded_c = utf_fold(c);
-    for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) {
-      const int uc = utf_ptr2char(p);
-      // Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf, not 0xff.
-      if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == folded_c) {
-        return (char *)p;
-      }
-    }
-    return NULL;
-  }
-
-  int cc;
-  if (ASCII_ISUPPER(c)) {
+    cc = utf_fold(c);
+    lc = cc;
+  } else if (ASCII_ISUPPER(c)) {
     cc = TOLOWER_ASC(c);
+    lc = cc;
   } else if (ASCII_ISLOWER(c)) {
     cc = TOUPPER_ASC(c);
+    lc = c;
   } else {
     return vim_strchr(s, c);
   }
 
-  char tofind[] = { (char)c, (char)cc, NUL };
-  return strpbrk(s, tofind);
+  for (const char *p = s; *p != NUL; p += utfc_ptr2len(p)) {
+    const int uc = utf_ptr2char(p);
+    if (c > 0x80 || uc > 0x80) {
+      // Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf, not 0xff.
+      // Compare with lower case of the character.
+      if ((uc < 0x80 || uc != (uint8_t)(*p)) && utf_fold(uc) == lc) {
+        return (char *)p;
+      }
+    } else if ((uint8_t)(*p) == c || (uint8_t)(*p) == cc) {
+      return (char *)p;
+    }
+  }
+
+  return NULL;
 }
 
 ////////////////////////////////////////////////////////////////
@@ -6619,11 +6643,9 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
               }
             }
           } else {
-            for (i = 0; i < len; i++) {
-              if (opnd[i] != rex.input[i]) {
-                status = RA_NOMATCH;
-                break;
-              }
+            if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) {
+              status = RA_NOMATCH;
+              break;
             }
           }
           rex.input += len;
@@ -13984,19 +14006,25 @@ static int skip_to_start(int c, colnr_T *colp)
 static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text)
 {
   colnr_T col = *startcol;
-  const int regstart_len = utf_ptr2len((char *)rex.line + col);
+  const int regstart_len = utf_char2len(regstart);
 
   while (true) {
     bool match = true;
     uint8_t *s1 = match_text;
-    uint8_t *s2 = rex.line + col + regstart_len;  // skip regstart
+    // skip regstart
+    int regstart_len2 = regstart_len;
+    if (regstart_len2 > 1 && utf_ptr2len((char *)rex.line + col) != regstart_len2) {
+      // because of case-folding of the previously matched text, we may need
+      // to skip fewer bytes than utf_char2len(regstart)
+      regstart_len2 = utf_char2len(utf_fold(regstart));
+    }
+    uint8_t *s2 = rex.line + col + regstart_len2;
     while (*s1) {
       int c1_len = utf_ptr2len((char *)s1);
       int c1 = utf_ptr2char((char *)s1);
       int c2_len = utf_ptr2len((char *)s2);
       int c2 = utf_ptr2char((char *)s2);
-      if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2)))
-          || c1_len != c2_len) {
+      if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) {
         match = false;
         break;
       }
@@ -15664,7 +15692,7 @@ static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int
 
     // If match_text is set it contains the full text that must match.
     // Nothing else to try. Doesn't handle combining chars well.
-    if (prog->match_text != NULL && !rex.reg_icombine) {
+    if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) {
       retval = find_match_text(&col, prog->regstart, prog->match_text);
       if (REG_MULTI) {
         rex.reg_mmatch->rmm_matchcol = col;
diff --git a/test/old/testdir/test_regexp_utf8.vim b/test/old/testdir/test_regexp_utf8.vim
index 07695f9cc8..505e99919c 100644
--- a/test/old/testdir/test_regexp_utf8.vim
+++ b/test/old/testdir/test_regexp_utf8.vim
@@ -594,4 +594,36 @@ func Test_combining_chars_in_collection()
   bw!
 endfunc
 
+func Test_search_multibyte_match_ascii()
+  new
+  " Match single 'ſ' and 's'
+  call setline(1,  'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Match several 'ſſ' and 'ss'
+  call setline(1,  'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re)
+  endfor
+  bw!
+endfunc
+
 " vim: shiftwidth=2 sts=2 expandtab