aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/regexp.c')
-rw-r--r--src/nvim/regexp.c318
1 files changed, 273 insertions, 45 deletions
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index 3536196a3b..86082adbb6 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -13,7 +13,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
-#include <sys/types.h>
+#include <uv.h>
#include "nvim/ascii_defs.h"
#include "nvim/buffer_defs.h"
@@ -21,14 +21,16 @@
#include "nvim/eval.h"
#include "nvim/eval/typval.h"
#include "nvim/eval/userfunc.h"
-#include "nvim/func_attr.h"
#include "nvim/garray.h"
-#include "nvim/gettext.h"
+#include "nvim/garray_defs.h"
+#include "nvim/gettext_defs.h"
#include "nvim/globals.h"
#include "nvim/keycodes.h"
#include "nvim/macros_defs.h"
#include "nvim/mark.h"
+#include "nvim/mark_defs.h"
#include "nvim/mbyte.h"
+#include "nvim/mbyte_defs.h"
#include "nvim/memline.h"
#include "nvim/memory.h"
#include "nvim/message.h"
@@ -43,6 +45,106 @@
#include "nvim/types_defs.h"
#include "nvim/vim_defs.h"
+enum {
+ /// In the NFA engine: how many braces are allowed.
+ /// TODO(RE): Use dynamic memory allocation instead of static, like here
+ NFA_MAX_BRACES = 20,
+};
+
+enum {
+ /// In the NFA engine: how many states are allowed.
+ NFA_MAX_STATES = 100000,
+ NFA_TOO_EXPENSIVE = -1,
+};
+
+/// Which regexp engine to use? Needed for vim_regcomp().
+/// Must match with 'regexpengine'.
+enum {
+ AUTOMATIC_ENGINE = 0,
+ BACKTRACKING_ENGINE = 1,
+ NFA_ENGINE = 2,
+};
+
+/// Structure returned by vim_regcomp() to pass on to vim_regexec().
+/// This is the general structure. For the actual matcher, two specific
+/// structures are used. See code below.
+struct regprog {
+ regengine_T *engine;
+ unsigned regflags;
+ unsigned re_engine; ///< Automatic, backtracking or NFA engine.
+ unsigned re_flags; ///< Second argument for vim_regcomp().
+ bool re_in_use; ///< prog is being executed
+};
+
+/// Structure used by the back track matcher.
+/// These fields are only to be used in regexp.c!
+/// See regexp.c for an explanation.
+typedef struct {
+ // These four members implement regprog_T.
+ regengine_T *engine;
+ unsigned regflags;
+ unsigned re_engine;
+ unsigned re_flags;
+ bool re_in_use;
+
+ int regstart;
+ uint8_t reganch;
+ uint8_t *regmust;
+ int regmlen;
+ uint8_t reghasz;
+ uint8_t program[];
+} bt_regprog_T;
+
+/// Structure representing a NFA state.
+/// An NFA state may have no outgoing edge, when it is a NFA_MATCH state.
+typedef struct nfa_state nfa_state_T;
+struct nfa_state {
+ int c;
+ nfa_state_T *out;
+ nfa_state_T *out1;
+ int id;
+ int lastlist[2]; ///< 0: normal, 1: recursive
+ int val;
+};
+
+/// Structure used by the NFA matcher.
+typedef struct {
+ // These four members implement regprog_T.
+ regengine_T *engine;
+ unsigned regflags;
+ unsigned re_engine;
+ unsigned re_flags;
+ bool re_in_use;
+
+ nfa_state_T *start; ///< points into state[]
+
+ int reganch; ///< pattern starts with ^
+ int regstart; ///< char at start of pattern
+ uint8_t *match_text; ///< plain text to match with
+
+ int has_zend; ///< pattern contains \ze
+ int has_backref; ///< pattern contains \1 .. \9
+ int reghasz;
+ char *pattern;
+ int nsubexp; ///< number of ()
+ int nstate;
+ nfa_state_T state[];
+} nfa_regprog_T;
+
+struct regengine {
+ /// bt_regcomp or nfa_regcomp
+ regprog_T *(*regcomp)(uint8_t *, int);
+ /// bt_regfree or nfa_regfree
+ void (*regfree)(regprog_T *);
+ /// bt_regexec_nl or nfa_regexec_nl
+ int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool);
+ /// bt_regexec_mult or nfa_regexec_mult
+ int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *);
+#ifdef REGEXP_DEBUG
+ uint8_t *expr;
+#endif
+};
+
// Structure used to save the current input state, when it needs to be
// restored after trying a match. Used by reg_save() and reg_restore().
// Also stores the length of "backpos".
@@ -1263,6 +1365,10 @@ static bool reg_match_visual(void)
top = curbuf->b_visual.vi_end;
bot = curbuf->b_visual.vi_start;
}
+ // a substitute command may have removed some lines
+ if (bot.lnum > curbuf->b_ml.ml_line_count) {
+ bot.lnum = curbuf->b_ml.ml_line_count;
+ }
mode = curbuf->b_visual.vi_mode;
curswant = curbuf->b_visual.vi_curswant;
}
@@ -5363,9 +5469,9 @@ static bool reg_save_equal(const regsave_T *save)
// After a failed match restore the sub-expressions.
#define restore_se(savep, posp, pp) { \
- if (REG_MULTI) /* NOLINT(readability/braces) */ \
+ if (REG_MULTI) \
*(posp) = (savep)->se_u.pos; \
- else /* NOLINT */ \
+ else \
*(pp) = (savep)->se_u.ptr; }
// Tentatively set the sub-expression start to the current position (after
@@ -5866,8 +5972,8 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
#ifdef REGEXP_DEBUG
if (scan != NULL && regnarrate) {
- os_errmsg((char *)regprop(scan));
- os_errmsg("(\n");
+ fprintf(stderr, "%s", (char *)regprop(scan));
+ fprintf(stderr, "%s", "(\n");
}
#endif
@@ -5893,18 +5999,18 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
#ifdef REGEXP_DEBUG
if (regnarrate) {
- os_errmsg((char *)regprop(scan));
- os_errmsg("...\n");
+ fprintf(stderr, "%s", (char *)regprop(scan));
+ fprintf(stderr, "%s", "...\n");
if (re_extmatch_in != NULL) {
int i;
- os_errmsg(_("External submatches:\n"));
+ fprintf(stderr, _("External submatches:\n"));
for (i = 0; i < NSUBEXP; i++) {
- os_errmsg(" \"");
+ fprintf(stderr, "%s", " \"");
if (re_extmatch_in->matches[i] != NULL) {
- os_errmsg((char *)re_extmatch_in->matches[i]);
+ fprintf(stderr, "%s", (char *)re_extmatch_in->matches[i]);
}
- os_errmsg("\"\n");
+ fprintf(stderr, "%s", "\"\n");
}
}
}
@@ -6326,15 +6432,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
break;
case ANYOF:
- case ANYBUT:
+ case ANYBUT: {
+ uint8_t *q = OPERAND(scan);
+
if (c == NUL) {
status = RA_NOMATCH;
- } else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) {
+ } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) {
status = RA_NOMATCH;
- } else {
- ADVANCE_REGINPUT();
+ } else { // Check following combining characters
+ int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q);
+
+ rex.input += utf_ptr2len((char *)rex.input);
+ q += utf_ptr2len((char *)q);
+
+ if (len == 0) {
+ break;
+ }
+
+ for (int i = 0; i < len; i++) {
+ if (q[i] != rex.input[i]) {
+ status = RA_NOMATCH;
+ break;
+ }
+ }
+ rex.input += len;
}
break;
+ }
case MULTIBYTECODE: {
int i, len;
@@ -6380,7 +6504,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out)
case RE_COMPOSING:
// Skip composing characters.
while (utf_iscomposing(utf_ptr2char((char *)rex.input))) {
- MB_CPTR_ADV(rex.input);
+ rex.input += utf_ptr2len((char *)rex.input);
}
break;
@@ -8688,9 +8812,9 @@ static void nfa_emit_equi_class(int c)
case 0x1eb2:
case 0x1eb4:
case 0x1eb6:
- EMIT2('A') EMIT2(A_grave) EMIT2(A_acute) // NOLINT(whitespace/cast)
- EMIT2(A_circumflex) EMIT2(A_virguilla) // NOLINT(whitespace/cast)
- EMIT2(A_diaeresis) EMIT2(A_ring) // NOLINT(whitespace/cast)
+ EMIT2('A') EMIT2(A_grave) EMIT2(A_acute)
+ EMIT2(A_circumflex) EMIT2(A_virguilla)
+ EMIT2(A_diaeresis) EMIT2(A_ring)
EMIT2(0x100) EMIT2(0x102) EMIT2(0x104)
EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0)
EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202)
@@ -8769,8 +8893,8 @@ static void nfa_emit_equi_class(int c)
case 0x1ec2:
case 0x1ec4:
case 0x1ec6:
- EMIT2('E') EMIT2(E_grave) EMIT2(E_acute) // NOLINT(whitespace/cast)
- EMIT2(E_circumflex) EMIT2(E_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('E') EMIT2(E_grave) EMIT2(E_acute)
+ EMIT2(E_circumflex) EMIT2(E_diaeresis)
EMIT2(0x112) EMIT2(0x114) EMIT2(0x116)
EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204)
EMIT2(0x206) EMIT2(0x228) EMIT2(0x246)
@@ -8838,8 +8962,8 @@ static void nfa_emit_equi_class(int c)
case 0x1e2e:
case 0x1ec8:
case 0x1eca:
- EMIT2('I') EMIT2(I_grave) EMIT2(I_acute) // NOLINT(whitespace/cast)
- EMIT2(I_circumflex) EMIT2(I_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('I') EMIT2(I_grave) EMIT2(I_acute)
+ EMIT2(I_circumflex) EMIT2(I_diaeresis)
EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c)
EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197)
EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a)
@@ -8948,9 +9072,9 @@ static void nfa_emit_equi_class(int c)
case 0x1ede:
case 0x1ee0:
case 0x1ee2:
- EMIT2('O') EMIT2(O_grave) EMIT2(O_acute) // NOLINT(whitespace/cast)
- EMIT2(O_circumflex) EMIT2(O_virguilla) // NOLINT(whitespace/cast)
- EMIT2(O_diaeresis) EMIT2(O_slash) // NOLINT(whitespace/cast)
+ EMIT2('O') EMIT2(O_grave) EMIT2(O_acute)
+ EMIT2(O_circumflex) EMIT2(O_virguilla)
+ EMIT2(O_diaeresis) EMIT2(O_slash)
EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150)
EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1)
EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe)
@@ -9065,8 +9189,8 @@ static void nfa_emit_equi_class(int c)
case 0x1eec:
case 0x1eee:
case 0x1ef0:
- EMIT2('U') EMIT2(U_grave) EMIT2(U_acute) // NOLINT(whitespace/cast)
- EMIT2(U_diaeresis) EMIT2(U_circumflex) // NOLINT(whitespace/cast)
+ EMIT2('U') EMIT2(U_grave) EMIT2(U_acute)
+ EMIT2(U_diaeresis) EMIT2(U_circumflex)
EMIT2(0x168) EMIT2(0x16a)
EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170)
EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3)
@@ -9169,9 +9293,9 @@ static void nfa_emit_equi_class(int c)
case 0x1eb5:
case 0x1eb7:
case 0x2c65:
- EMIT2('a') EMIT2(a_grave) EMIT2(a_acute) // NOLINT(whitespace/cast)
- EMIT2(a_circumflex) EMIT2(a_virguilla) // NOLINT(whitespace/cast)
- EMIT2(a_diaeresis) EMIT2(a_ring) // NOLINT(whitespace/cast)
+ EMIT2('a') EMIT2(a_grave) EMIT2(a_acute)
+ EMIT2(a_circumflex) EMIT2(a_virguilla)
+ EMIT2(a_diaeresis) EMIT2(a_ring)
EMIT2(0x101) EMIT2(0x103) EMIT2(0x105)
EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1)
EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203)
@@ -9258,8 +9382,8 @@ static void nfa_emit_equi_class(int c)
case 0x1ec3:
case 0x1ec5:
case 0x1ec7:
- EMIT2('e') EMIT2(e_grave) EMIT2(e_acute) // NOLINT(whitespace/cast)
- EMIT2(e_circumflex) EMIT2(e_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('e') EMIT2(e_grave) EMIT2(e_acute)
+ EMIT2(e_circumflex) EMIT2(e_diaeresis)
EMIT2(0x113) EMIT2(0x115)
EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b)
EMIT2(0x205) EMIT2(0x207) EMIT2(0x229)
@@ -9334,8 +9458,8 @@ static void nfa_emit_equi_class(int c)
case 0x1e2f:
case 0x1ec9:
case 0x1ecb:
- EMIT2('i') EMIT2(i_grave) EMIT2(i_acute) // NOLINT(whitespace/cast)
- EMIT2(i_circumflex) EMIT2(i_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('i') EMIT2(i_grave) EMIT2(i_acute)
+ EMIT2(i_circumflex) EMIT2(i_diaeresis)
EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d)
EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209)
EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96)
@@ -9451,9 +9575,9 @@ static void nfa_emit_equi_class(int c)
case 0x1edf:
case 0x1ee1:
case 0x1ee3:
- EMIT2('o') EMIT2(o_grave) EMIT2(o_acute) // NOLINT(whitespace/cast)
- EMIT2(o_circumflex) EMIT2(o_virguilla) // NOLINT(whitespace/cast)
- EMIT2(o_diaeresis) EMIT2(o_slash) // NOLINT(whitespace/cast)
+ EMIT2('o') EMIT2(o_grave) EMIT2(o_acute)
+ EMIT2(o_circumflex) EMIT2(o_virguilla)
+ EMIT2(o_diaeresis) EMIT2(o_slash)
EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151)
EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb)
EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d)
@@ -9582,8 +9706,8 @@ static void nfa_emit_equi_class(int c)
case 0x1eed:
case 0x1eef:
case 0x1ef1:
- EMIT2('u') EMIT2(u_grave) EMIT2(u_acute) // NOLINT(whitespace/cast)
- EMIT2(u_circumflex) EMIT2(u_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('u') EMIT2(u_grave) EMIT2(u_acute)
+ EMIT2(u_circumflex) EMIT2(u_diaeresis)
EMIT2(0x169) EMIT2(0x16b)
EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171)
EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8)
@@ -9636,7 +9760,7 @@ static void nfa_emit_equi_class(int c)
case 0x1ef5:
case 0x1ef7:
case 0x1ef9:
- EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis) // NOLINT(whitespace/cast)
+ EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis)
EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f)
EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3)
EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9)
@@ -9841,7 +9965,7 @@ static int nfa_regatom(void)
emsg(_(e_nopresub));
return FAIL;
}
- for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) {
+ for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; lp += utf_ptr2len((char *)lp)) {
EMIT(utf_ptr2char((char *)lp));
if (lp != (uint8_t *)reg_prev_sub) {
EMIT(NFA_CONCAT);
@@ -10348,13 +10472,39 @@ collection:
} else {
if (got_coll_char == true && startc == 0) {
EMIT(0x0a);
+ EMIT(NFA_CONCAT);
} else {
EMIT(startc);
+ if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) {
+ EMIT(NFA_CONCAT);
+ }
}
- EMIT(NFA_CONCAT);
}
}
+ int plen;
+ if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) {
+ int i = utf_ptr2len(regparse);
+
+ c = utf_ptr2char(regparse + i);
+
+ // Add composing characters
+ while (true) {
+ if (c == 0) {
+ // \x00 is translated to \x0a, start at \x01.
+ EMIT(1);
+ } else {
+ EMIT(c);
+ }
+ EMIT(NFA_CONCAT);
+ if ((i += utf_char2len(c)) >= plen) {
+ break;
+ }
+ c = utf_ptr2char(regparse + i);
+ }
+ EMIT(NFA_COMPOSING);
+ EMIT(NFA_CONCAT);
+ }
MB_PTR_ADV(regparse);
} // while (p < endp)
@@ -14403,6 +14553,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm
state = t->state->out;
result_if_matched = (t->state->c == NFA_START_COLL);
while (true) {
+ if (state->c == NFA_COMPOSING) {
+ int mc = curc;
+ int len = 0;
+ nfa_state_T *end;
+ nfa_state_T *sta;
+ int cchars[MAX_MCO];
+ int ccount = 0;
+ int j;
+
+ sta = t->state->out->out;
+ if (utf_iscomposing(sta->c)) {
+ // Only match composing character(s), ignore base
+ // character. Used for ".{composing}" and "{composing}"
+ // (no preceding character).
+ len += utf_char2len(mc);
+ }
+ if (rex.reg_icombine && len == 0) {
+ // If \Z was present, then ignore composing characters.
+ // When ignoring the base character this always matches.
+ if (sta->c != curc) {
+ result = FAIL;
+ } else {
+ result = OK;
+ }
+ while (sta->c != NFA_END_COMPOSING) {
+ sta = sta->out;
+ }
+ }
+ // Check base character matches first, unless ignored.
+ else if (len > 0 || mc == sta->c) {
+ if (len == 0) {
+ len += utf_char2len(mc);
+ sta = sta->out;
+ }
+
+ // We don't care about the order of composing characters.
+ // Get them into cchars[] first.
+ while (len < clen) {
+ mc = utf_ptr2char((char *)rex.input + len);
+ cchars[ccount++] = mc;
+ len += utf_char2len(mc);
+ if (ccount == MAX_MCO) {
+ break;
+ }
+ }
+
+ // Check that each composing char in the pattern matches a
+ // composing char in the text. We do not check if all
+ // composing chars are matched.
+ result = OK;
+ while (sta->c != NFA_END_COMPOSING) {
+ for (j = 0; j < ccount; j++) {
+ if (cchars[j] == sta->c) {
+ break;
+ }
+ }
+ if (j == ccount) {
+ result = FAIL;
+ break;
+ }
+ sta = sta->out;
+ }
+ } else {
+ result = FAIL;
+ }
+
+ if (t->state->out->out1->c == NFA_END_COMPOSING) {
+ end = t->state->out->out1;
+ ADD_STATE_IF_MATCH(end);
+ }
+ break;
+ }
if (state->c == NFA_END_COLL) {
result = !result_if_matched;
break;
@@ -15545,6 +15767,9 @@ static regengine_T bt_regengine = {
bt_regfree,
bt_regexec_nl,
bt_regexec_multi,
+#ifdef REGEXP_DEBUG
+ "",
+#endif
};
static regengine_T nfa_regengine = {
@@ -15552,6 +15777,9 @@ static regengine_T nfa_regengine = {
nfa_regfree,
nfa_regexec_nl,
nfa_regexec_multi,
+#ifdef REGEXP_DEBUG
+ "",
+#endif
};
// Which regexp engine to use? Needed for vim_regcomp().