1 files changed, 273 insertions, 45 deletions
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 3536196a3b..86082adbb6 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -13,7 +13,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <string.h>
-#include <sys/types.h>
+#include <uv.h>
 
 #include "nvim/ascii_defs.h"
 #include "nvim/buffer_defs.h"
@@ -21,14 +21,16 @@
 #include "nvim/eval.h"
 #include "nvim/eval/typval.h"
 #include "nvim/eval/userfunc.h"
-#include "nvim/func_attr.h"
 #include "nvim/garray.h"
-#include "nvim/gettext.h"
+#include "nvim/garray_defs.h"
+#include "nvim/gettext_defs.h"
 #include "nvim/globals.h"
 #include "nvim/keycodes.h"
 #include "nvim/macros_defs.h"
 #include "nvim/mark.h"
+#include "nvim/mark_defs.h"
 #include "nvim/mbyte.h"
+#include "nvim/mbyte_defs.h"
 #include "nvim/memline.h"
 #include "nvim/memory.h"
 #include "nvim/message.h"
@@ -43,6 +45,106 @@
 #include "nvim/types_defs.h"
 #include "nvim/vim_defs.h"
 
+enum {
+  /// In the NFA engine: how many braces are allowed.
+  /// TODO(RE): Use dynamic memory allocation instead of static, like here
+  NFA_MAX_BRACES = 20,
+};
+
+enum {
+  /// In the NFA engine: how many states are allowed.
+  NFA_MAX_STATES = 100000,
+  NFA_TOO_EXPENSIVE = -1,
+};
+
+/// Which regexp engine to use? Needed for vim_regcomp().
+/// Must match with 'regexpengine'.
+enum {
+  AUTOMATIC_ENGINE    = 0,
+  BACKTRACKING_ENGINE = 1,
+  NFA_ENGINE          = 2,
+};
+
+/// Structure returned by vim_regcomp() to pass on to vim_regexec().
+/// This is the general structure. For the actual matcher, two specific
+/// structures are used. See code below.
+struct regprog {
+  regengine_T *engine;
+  unsigned regflags;
+  unsigned re_engine;  ///< Automatic, backtracking or NFA engine.
+  unsigned re_flags;   ///< Second argument for vim_regcomp().
+  bool re_in_use;      ///< prog is being executed
+};
+
+/// Structure used by the back track matcher.
+/// These fields are only to be used in regexp.c!
+/// See regexp.c for an explanation.
+typedef struct {
+  // These four members implement regprog_T.
+  regengine_T *engine;
+  unsigned regflags;
+  unsigned re_engine;
+  unsigned re_flags;
+  bool re_in_use;
+
+  int regstart;
+  uint8_t reganch;
+  uint8_t *regmust;
+  int regmlen;
+  uint8_t reghasz;
+  uint8_t program[];
+} bt_regprog_T;
+
+/// Structure representing a NFA state.
+/// An NFA state may have no outgoing edge, when it is a NFA_MATCH state.
+typedef struct nfa_state nfa_state_T;
+struct nfa_state {
+  int c;
+  nfa_state_T *out;
+  nfa_state_T *out1;
+  int id;
+  int lastlist[2];  ///< 0: normal, 1: recursive
+  int val;
+};
+
+/// Structure used by the NFA matcher.
+typedef struct {
+  // These four members implement regprog_T.
+  regengine_T *engine;
+  unsigned regflags;
+  unsigned re_engine;
+  unsigned re_flags;
+  bool re_in_use;
+
+  nfa_state_T *start;   ///< points into state[]
+
+  int reganch;          ///< pattern starts with ^
+  int regstart;         ///< char at start of pattern
+  uint8_t *match_text;  ///< plain text to match with
+
+  int has_zend;         ///< pattern contains \ze
+  int has_backref;      ///< pattern contains \1 .. \9
+  int reghasz;
+  char *pattern;
+  int nsubexp;          ///< number of ()
+  int nstate;
+  nfa_state_T state[];
+} nfa_regprog_T;
+
+struct regengine {
+  /// bt_regcomp or nfa_regcomp
+  regprog_T *(*regcomp)(uint8_t *, int);
+  /// bt_regfree or nfa_regfree
+  void (*regfree)(regprog_T *);
+  /// bt_regexec_nl or nfa_regexec_nl
+  int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
+  /// bt_regexec_mult or nfa_regexec_mult
+  int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
+#ifdef REGEXP_DEBUG
+  uint8_t *expr;
+#endif
+};
+
 // Structure used to save the current input state, when it needs to be
 // restored after trying a match.  Used by reg_save() and reg_restore().
 // Also stores the length of "backpos".
@@ -1263,6 +1365,10 @@ static bool reg_match_visual(void)
       top = curbuf->b_visual.vi_end;
       bot = curbuf->b_visual.vi_start;
     }
+    // a substitute command may have removed some lines
+    if (bot.lnum > curbuf->b_ml.ml_line_count) {
+      bot.lnum = curbuf->b_ml.ml_line_count;
+    }
     mode = curbuf->b_visual.vi_mode;
     curswant = curbuf->b_visual.vi_curswant;
   }
@@ -5363,9 +5469,9 @@ static bool reg_save_equal(const regsave_T *save)
 
 // After a failed match restore the sub-expressions.
 #define restore_se(savep, posp, pp) { \
-  if (REG_MULTI)  /* NOLINT(readability/braces) */ \
+  if (REG_MULTI) \
   *(posp) = (savep)->se_u.pos; \
-  else  /* NOLINT */ \
+  else \
   *(pp) = (savep)->se_u.ptr; }
 
 // Tentatively set the sub-expression start to the current position (after
@@ -5866,8 +5972,8 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
 
 #ifdef REGEXP_DEBUG
     if (scan != NULL && regnarrate) {
-      os_errmsg((char *)regprop(scan));
-      os_errmsg("(\n");
+      fprintf(stderr, "%s", (char *)regprop(scan));
+      fprintf(stderr, "%s", "(\n");
     }
 #endif
 
@@ -5893,18 +5999,18 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
 
 #ifdef REGEXP_DEBUG
       if (regnarrate) {
-        os_errmsg((char *)regprop(scan));
-        os_errmsg("...\n");
+        fprintf(stderr, "%s", (char *)regprop(scan));
+        fprintf(stderr, "%s", "...\n");
         if (re_extmatch_in != NULL) {
           int i;
 
-          os_errmsg(_("External submatches:\n"));
+          fprintf(stderr, _("External submatches:\n"));
           for (i = 0; i < NSUBEXP; i++) {
-            os_errmsg("    \"");
+            fprintf(stderr, "%s", "    \"");
             if (re_extmatch_in->matches[i] != NULL) {
-              os_errmsg((char *)re_extmatch_in->matches[i]);
+              fprintf(stderr, "%s", (char *)re_extmatch_in->matches[i]);
             }
-            os_errmsg("\"\n");
+            fprintf(stderr, "%s", "\"\n");
           }
         }
       }
@@ -6326,15 +6432,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
         break;
 
         case ANYOF:
-        case ANYBUT:
+        case ANYBUT: {
+          uint8_t *q = OPERAND(scan);
+
           if (c == NUL) {
             status = RA_NOMATCH;
-          } else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) {
+          } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
             status = RA_NOMATCH;
-          } else {
-            ADVANCE_REGINPUT();
+          } else {  // Check following combining characters
+            int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
+
+            rex.input += utf_ptr2len((char *)rex.input);
+            q += utf_ptr2len((char *)q);
+
+            if (len == 0) {
+              break;
+            }
+
+            for (int i = 0; i < len; i++) {
+              if (q[i] != rex.input[i]) {
+                status = RA_NOMATCH;
+                break;
+              }
+            }
+            rex.input += len;
           }
           break;
+        }
 
         case MULTIBYTECODE: {
           int i, len;
@@ -6380,7 +6504,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
         case RE_COMPOSING:
           // Skip composing characters.
           while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
-            MB_CPTR_ADV(rex.input);
+            rex.input += utf_ptr2len((char *)rex.input);
           }
           break;
 
@@ -8688,9 +8812,9 @@ static void nfa_emit_equi_class(int c)
     case 0x1eb2:
     case 0x1eb4:
     case 0x1eb6:
-      EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)  // NOLINT(whitespace/cast)
-      EMIT2(A_circumflex) EMIT2(A_virguilla)    // NOLINT(whitespace/cast)
-      EMIT2(A_diaeresis) EMIT2(A_ring)          // NOLINT(whitespace/cast)
+      EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
+      EMIT2(A_circumflex) EMIT2(A_virguilla)
+      EMIT2(A_diaeresis) EMIT2(A_ring)
       EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
       EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
       EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
@@ -8769,8 +8893,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1ec2:
     case 0x1ec4:
     case 0x1ec6:
-      EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)  // NOLINT(whitespace/cast)
-      EMIT2(E_circumflex) EMIT2(E_diaeresis)    // NOLINT(whitespace/cast)
+      EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
+      EMIT2(E_circumflex) EMIT2(E_diaeresis)
       EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
       EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
       EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
@@ -8838,8 +8962,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1e2e:
     case 0x1ec8:
     case 0x1eca:
-      EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)  // NOLINT(whitespace/cast)
-      EMIT2(I_circumflex) EMIT2(I_diaeresis)    // NOLINT(whitespace/cast)
+      EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
+      EMIT2(I_circumflex) EMIT2(I_diaeresis)
       EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
       EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
       EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
@@ -8948,9 +9072,9 @@ static void nfa_emit_equi_class(int c)
     case 0x1ede:
     case 0x1ee0:
     case 0x1ee2:
-      EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)  // NOLINT(whitespace/cast)
-      EMIT2(O_circumflex) EMIT2(O_virguilla)    // NOLINT(whitespace/cast)
-      EMIT2(O_diaeresis) EMIT2(O_slash)         // NOLINT(whitespace/cast)
+      EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
+      EMIT2(O_circumflex) EMIT2(O_virguilla)
+      EMIT2(O_diaeresis) EMIT2(O_slash)
       EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
       EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
       EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
@@ -9065,8 +9189,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1eec:
     case 0x1eee:
     case 0x1ef0:
-      EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)  // NOLINT(whitespace/cast)
-      EMIT2(U_diaeresis) EMIT2(U_circumflex)    // NOLINT(whitespace/cast)
+      EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
+      EMIT2(U_diaeresis) EMIT2(U_circumflex)
       EMIT2(0x168) EMIT2(0x16a)
       EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
       EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
@@ -9169,9 +9293,9 @@ static void nfa_emit_equi_class(int c)
     case 0x1eb5:
     case 0x1eb7:
     case 0x2c65:
-      EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)  // NOLINT(whitespace/cast)
-      EMIT2(a_circumflex) EMIT2(a_virguilla)    // NOLINT(whitespace/cast)
-      EMIT2(a_diaeresis) EMIT2(a_ring)          // NOLINT(whitespace/cast)
+      EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
+      EMIT2(a_circumflex) EMIT2(a_virguilla)
+      EMIT2(a_diaeresis) EMIT2(a_ring)
       EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
       EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
       EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
@@ -9258,8 +9382,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1ec3:
     case 0x1ec5:
     case 0x1ec7:
-      EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)  // NOLINT(whitespace/cast)
-      EMIT2(e_circumflex) EMIT2(e_diaeresis)    // NOLINT(whitespace/cast)
+      EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
+      EMIT2(e_circumflex) EMIT2(e_diaeresis)
       EMIT2(0x113) EMIT2(0x115)
       EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
       EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
@@ -9334,8 +9458,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1e2f:
     case 0x1ec9:
     case 0x1ecb:
-      EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)  // NOLINT(whitespace/cast)
-      EMIT2(i_circumflex) EMIT2(i_diaeresis)    // NOLINT(whitespace/cast)
+      EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
+      EMIT2(i_circumflex) EMIT2(i_diaeresis)
       EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
       EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
       EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
@@ -9451,9 +9575,9 @@ static void nfa_emit_equi_class(int c)
     case 0x1edf:
     case 0x1ee1:
     case 0x1ee3:
-      EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)  // NOLINT(whitespace/cast)
-      EMIT2(o_circumflex) EMIT2(o_virguilla)    // NOLINT(whitespace/cast)
-      EMIT2(o_diaeresis) EMIT2(o_slash)         // NOLINT(whitespace/cast)
+      EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
+      EMIT2(o_circumflex) EMIT2(o_virguilla)
+      EMIT2(o_diaeresis) EMIT2(o_slash)
       EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
       EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
       EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
@@ -9582,8 +9706,8 @@ static void nfa_emit_equi_class(int c)
     case 0x1eed:
     case 0x1eef:
     case 0x1ef1:
-      EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)  // NOLINT(whitespace/cast)
-      EMIT2(u_circumflex) EMIT2(u_diaeresis)    // NOLINT(whitespace/cast)
+      EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
+      EMIT2(u_circumflex) EMIT2(u_diaeresis)
       EMIT2(0x169) EMIT2(0x16b)
       EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
       EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
@@ -9636,7 +9760,7 @@ static void nfa_emit_equi_class(int c)
     case 0x1ef5:
     case 0x1ef7:
     case 0x1ef9:
-      EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)  // NOLINT(whitespace/cast)
+      EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
       EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
       EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
       EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
@@ -9841,7 +9965,7 @@ static int nfa_regatom(void)
       emsg(_(e_nopresub));
       return FAIL;
     }
-    for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) {
+    for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; lp += utf_ptr2len((char *)lp)) {
       EMIT(utf_ptr2char((char *)lp));
       if (lp != (uint8_t *)reg_prev_sub) {
         EMIT(NFA_CONCAT);
@@ -10348,13 +10472,39 @@ collection:
           } else {
             if (got_coll_char == true && startc == 0) {
               EMIT(0x0a);
+              EMIT(NFA_CONCAT);
             } else {
               EMIT(startc);
+              if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
+                EMIT(NFA_CONCAT);
+              }
             }
-            EMIT(NFA_CONCAT);
           }
         }
 
+        int plen;
+        if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
+          int i = utf_ptr2len(regparse);
+
+          c = utf_ptr2char(regparse + i);
+
+          // Add composing characters
+          while (true) {
+            if (c == 0) {
+              // \x00 is translated to \x0a, start at \x01.
+              EMIT(1);
+            } else {
+              EMIT(c);
+            }
+            EMIT(NFA_CONCAT);
+            if ((i += utf_char2len(c)) >= plen) {
+              break;
+            }
+            c = utf_ptr2char(regparse + i);
+          }
+          EMIT(NFA_COMPOSING);
+          EMIT(NFA_CONCAT);
+        }
         MB_PTR_ADV(regparse);
       }           // while (p < endp)
 
@@ -14403,6 +14553,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
         state = t->state->out;
         result_if_matched = (t->state->c == NFA_START_COLL);
         while (true) {
+          if (state->c == NFA_COMPOSING) {
+            int mc = curc;
+            int len = 0;
+            nfa_state_T *end;
+            nfa_state_T *sta;
+            int cchars[MAX_MCO];
+            int ccount = 0;
+            int j;
+
+            sta = t->state->out->out;
+            if (utf_iscomposing(sta->c)) {
+              // Only match composing character(s), ignore base
+              // character.  Used for ".{composing}" and "{composing}"
+              // (no preceding character).
+              len += utf_char2len(mc);
+            }
+            if (rex.reg_icombine && len == 0) {
+              // If \Z was present, then ignore composing characters.
+              // When ignoring the base character this always matches.
+              if (sta->c != curc) {
+                result = FAIL;
+              } else {
+                result = OK;
+              }
+              while (sta->c != NFA_END_COMPOSING) {
+                sta = sta->out;
+              }
+            }
+            // Check base character matches first, unless ignored.
+            else if (len > 0 || mc == sta->c) {
+              if (len == 0) {
+                len += utf_char2len(mc);
+                sta = sta->out;
+              }
+
+              // We don't care about the order of composing characters.
+              // Get them into cchars[] first.
+              while (len < clen) {
+                mc = utf_ptr2char((char *)rex.input + len);
+                cchars[ccount++] = mc;
+                len += utf_char2len(mc);
+                if (ccount == MAX_MCO) {
+                  break;
+                }
+              }
+
+              // Check that each composing char in the pattern matches a
+              // composing char in the text.  We do not check if all
+              // composing chars are matched.
+              result = OK;
+              while (sta->c != NFA_END_COMPOSING) {
+                for (j = 0; j < ccount; j++) {
+                  if (cchars[j] == sta->c) {
+                    break;
+                  }
+                }
+                if (j == ccount) {
+                  result = FAIL;
+                  break;
+                }
+                sta = sta->out;
+              }
+            } else {
+              result = FAIL;
+            }
+
+            if (t->state->out->out1->c == NFA_END_COMPOSING) {
+              end = t->state->out->out1;
+              ADD_STATE_IF_MATCH(end);
+            }
+            break;
+          }
           if (state->c == NFA_END_COLL) {
             result = !result_if_matched;
             break;
@@ -15545,6 +15767,9 @@ static regengine_T bt_regengine = {
   bt_regfree,
   bt_regexec_nl,
   bt_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };
 
 static regengine_T nfa_regengine = {
@@ -15552,6 +15777,9 @@ static regengine_T nfa_regengine = {
   nfa_regfree,
   nfa_regexec_nl,
   nfa_regexec_multi,
+#ifdef REGEXP_DEBUG
+  "",
+#endif
 };
 
 // Which regexp engine to use? Needed for vim_regcomp().