diff options
Diffstat (limited to 'src/nvim/regexp.c')
-rw-r--r-- | src/nvim/regexp.c | 318 |
1 files changed, 273 insertions, 45 deletions
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 3536196a3b..86082adbb6 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -13,7 +13,7 @@ #include <stdbool.h> #include <stddef.h> #include <string.h> -#include <sys/types.h> +#include <uv.h> #include "nvim/ascii_defs.h" #include "nvim/buffer_defs.h" @@ -21,14 +21,16 @@ #include "nvim/eval.h" #include "nvim/eval/typval.h" #include "nvim/eval/userfunc.h" -#include "nvim/func_attr.h" #include "nvim/garray.h" -#include "nvim/gettext.h" +#include "nvim/garray_defs.h" +#include "nvim/gettext_defs.h" #include "nvim/globals.h" #include "nvim/keycodes.h" #include "nvim/macros_defs.h" #include "nvim/mark.h" +#include "nvim/mark_defs.h" #include "nvim/mbyte.h" +#include "nvim/mbyte_defs.h" #include "nvim/memline.h" #include "nvim/memory.h" #include "nvim/message.h" @@ -43,6 +45,106 @@ #include "nvim/types_defs.h" #include "nvim/vim_defs.h" +enum { + /// In the NFA engine: how many braces are allowed. + /// TODO(RE): Use dynamic memory allocation instead of static, like here + NFA_MAX_BRACES = 20, +}; + +enum { + /// In the NFA engine: how many states are allowed. + NFA_MAX_STATES = 100000, + NFA_TOO_EXPENSIVE = -1, +}; + +/// Which regexp engine to use? Needed for vim_regcomp(). +/// Must match with 'regexpengine'. +enum { + AUTOMATIC_ENGINE = 0, + BACKTRACKING_ENGINE = 1, + NFA_ENGINE = 2, +}; + +/// Structure returned by vim_regcomp() to pass on to vim_regexec(). +/// This is the general structure. For the actual matcher, two specific +/// structures are used. See code below. +struct regprog { + regengine_T *engine; + unsigned regflags; + unsigned re_engine; ///< Automatic, backtracking or NFA engine. + unsigned re_flags; ///< Second argument for vim_regcomp(). + bool re_in_use; ///< prog is being executed +}; + +/// Structure used by the back track matcher. +/// These fields are only to be used in regexp.c! +/// See regexp.c for an explanation. +typedef struct { + // These four members implement regprog_T. + regengine_T *engine; + unsigned regflags; + unsigned re_engine; + unsigned re_flags; + bool re_in_use; + + int regstart; + uint8_t reganch; + uint8_t *regmust; + int regmlen; + uint8_t reghasz; + uint8_t program[]; +} bt_regprog_T; + +/// Structure representing a NFA state. +/// An NFA state may have no outgoing edge, when it is a NFA_MATCH state. +typedef struct nfa_state nfa_state_T; +struct nfa_state { + int c; + nfa_state_T *out; + nfa_state_T *out1; + int id; + int lastlist[2]; ///< 0: normal, 1: recursive + int val; +}; + +/// Structure used by the NFA matcher. +typedef struct { + // These four members implement regprog_T. + regengine_T *engine; + unsigned regflags; + unsigned re_engine; + unsigned re_flags; + bool re_in_use; + + nfa_state_T *start; ///< points into state[] + + int reganch; ///< pattern starts with ^ + int regstart; ///< char at start of pattern + uint8_t *match_text; ///< plain text to match with + + int has_zend; ///< pattern contains \ze + int has_backref; ///< pattern contains \1 .. \9 + int reghasz; + char *pattern; + int nsubexp; ///< number of () + int nstate; + nfa_state_T state[]; +} nfa_regprog_T; + +struct regengine { + /// bt_regcomp or nfa_regcomp + regprog_T *(*regcomp)(uint8_t *, int); + /// bt_regfree or nfa_regfree + void (*regfree)(regprog_T *); + /// bt_regexec_nl or nfa_regexec_nl + int (*regexec_nl)(regmatch_T *, uint8_t *, colnr_T, bool); + /// bt_regexec_mult or nfa_regexec_mult + int (*regexec_multi)(regmmatch_T *, win_T *, buf_T *, linenr_T, colnr_T, proftime_T *, int *); +#ifdef REGEXP_DEBUG + uint8_t *expr; +#endif +}; + // Structure used to save the current input state, when it needs to be // restored after trying a match. Used by reg_save() and reg_restore(). // Also stores the length of "backpos". @@ -1263,6 +1365,10 @@ static bool reg_match_visual(void) top = curbuf->b_visual.vi_end; bot = curbuf->b_visual.vi_start; } + // a substitute command may have removed some lines + if (bot.lnum > curbuf->b_ml.ml_line_count) { + bot.lnum = curbuf->b_ml.ml_line_count; + } mode = curbuf->b_visual.vi_mode; curswant = curbuf->b_visual.vi_curswant; } @@ -5363,9 +5469,9 @@ static bool reg_save_equal(const regsave_T *save) // After a failed match restore the sub-expressions. #define restore_se(savep, posp, pp) { \ - if (REG_MULTI) /* NOLINT(readability/braces) */ \ + if (REG_MULTI) \ *(posp) = (savep)->se_u.pos; \ - else /* NOLINT */ \ + else \ *(pp) = (savep)->se_u.ptr; } // Tentatively set the sub-expression start to the current position (after @@ -5866,8 +5972,8 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) #ifdef REGEXP_DEBUG if (scan != NULL && regnarrate) { - os_errmsg((char *)regprop(scan)); - os_errmsg("(\n"); + fprintf(stderr, "%s", (char *)regprop(scan)); + fprintf(stderr, "%s", "(\n"); } #endif @@ -5893,18 +5999,18 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) #ifdef REGEXP_DEBUG if (regnarrate) { - os_errmsg((char *)regprop(scan)); - os_errmsg("...\n"); + fprintf(stderr, "%s", (char *)regprop(scan)); + fprintf(stderr, "%s", "...\n"); if (re_extmatch_in != NULL) { int i; - os_errmsg(_("External submatches:\n")); + fprintf(stderr, _("External submatches:\n")); for (i = 0; i < NSUBEXP; i++) { - os_errmsg(" \""); + fprintf(stderr, "%s", " \""); if (re_extmatch_in->matches[i] != NULL) { - os_errmsg((char *)re_extmatch_in->matches[i]); + fprintf(stderr, "%s", (char *)re_extmatch_in->matches[i]); } - os_errmsg("\"\n"); + fprintf(stderr, "%s", "\"\n"); } } } @@ -6326,15 +6432,33 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) break; case ANYOF: - case ANYBUT: + case ANYBUT: { + uint8_t *q = OPERAND(scan); + if (c == NUL) { status = RA_NOMATCH; - } else if ((cstrchr((char *)OPERAND(scan), c) == NULL) == (op == ANYOF)) { + } else if ((cstrchr((char *)q, c) == NULL) == (op == ANYOF)) { status = RA_NOMATCH; - } else { - ADVANCE_REGINPUT(); + } else { // Check following combining characters + int len = utfc_ptr2len((char *)q) - utf_ptr2len((char *)q); + + rex.input += utf_ptr2len((char *)rex.input); + q += utf_ptr2len((char *)q); + + if (len == 0) { + break; + } + + for (int i = 0; i < len; i++) { + if (q[i] != rex.input[i]) { + status = RA_NOMATCH; + break; + } + } + rex.input += len; } break; + } case MULTIBYTECODE: { int i, len; @@ -6380,7 +6504,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) case RE_COMPOSING: // Skip composing characters. while (utf_iscomposing(utf_ptr2char((char *)rex.input))) { - MB_CPTR_ADV(rex.input); + rex.input += utf_ptr2len((char *)rex.input); } break; @@ -8688,9 +8812,9 @@ static void nfa_emit_equi_class(int c) case 0x1eb2: case 0x1eb4: case 0x1eb6: - EMIT2('A') EMIT2(A_grave) EMIT2(A_acute) // NOLINT(whitespace/cast) - EMIT2(A_circumflex) EMIT2(A_virguilla) // NOLINT(whitespace/cast) - EMIT2(A_diaeresis) EMIT2(A_ring) // NOLINT(whitespace/cast) + EMIT2('A') EMIT2(A_grave) EMIT2(A_acute) + EMIT2(A_circumflex) EMIT2(A_virguilla) + EMIT2(A_diaeresis) EMIT2(A_ring) EMIT2(0x100) EMIT2(0x102) EMIT2(0x104) EMIT2(0x1cd) EMIT2(0x1de) EMIT2(0x1e0) EMIT2(0x1fa) EMIT2(0x200) EMIT2(0x202) @@ -8769,8 +8893,8 @@ static void nfa_emit_equi_class(int c) case 0x1ec2: case 0x1ec4: case 0x1ec6: - EMIT2('E') EMIT2(E_grave) EMIT2(E_acute) // NOLINT(whitespace/cast) - EMIT2(E_circumflex) EMIT2(E_diaeresis) // NOLINT(whitespace/cast) + EMIT2('E') EMIT2(E_grave) EMIT2(E_acute) + EMIT2(E_circumflex) EMIT2(E_diaeresis) EMIT2(0x112) EMIT2(0x114) EMIT2(0x116) EMIT2(0x118) EMIT2(0x11a) EMIT2(0x204) EMIT2(0x206) EMIT2(0x228) EMIT2(0x246) @@ -8838,8 +8962,8 @@ static void nfa_emit_equi_class(int c) case 0x1e2e: case 0x1ec8: case 0x1eca: - EMIT2('I') EMIT2(I_grave) EMIT2(I_acute) // NOLINT(whitespace/cast) - EMIT2(I_circumflex) EMIT2(I_diaeresis) // NOLINT(whitespace/cast) + EMIT2('I') EMIT2(I_grave) EMIT2(I_acute) + EMIT2(I_circumflex) EMIT2(I_diaeresis) EMIT2(0x128) EMIT2(0x12a) EMIT2(0x12c) EMIT2(0x12e) EMIT2(0x130) EMIT2(0x197) EMIT2(0x1cf) EMIT2(0x208) EMIT2(0x20a) @@ -8948,9 +9072,9 @@ static void nfa_emit_equi_class(int c) case 0x1ede: case 0x1ee0: case 0x1ee2: - EMIT2('O') EMIT2(O_grave) EMIT2(O_acute) // NOLINT(whitespace/cast) - EMIT2(O_circumflex) EMIT2(O_virguilla) // NOLINT(whitespace/cast) - EMIT2(O_diaeresis) EMIT2(O_slash) // NOLINT(whitespace/cast) + EMIT2('O') EMIT2(O_grave) EMIT2(O_acute) + EMIT2(O_circumflex) EMIT2(O_virguilla) + EMIT2(O_diaeresis) EMIT2(O_slash) EMIT2(0x14c) EMIT2(0x14e) EMIT2(0x150) EMIT2(0x19f) EMIT2(0x1a0) EMIT2(0x1d1) EMIT2(0x1ea) EMIT2(0x1ec) EMIT2(0x1fe) @@ -9065,8 +9189,8 @@ static void nfa_emit_equi_class(int c) case 0x1eec: case 0x1eee: case 0x1ef0: - EMIT2('U') EMIT2(U_grave) EMIT2(U_acute) // NOLINT(whitespace/cast) - EMIT2(U_diaeresis) EMIT2(U_circumflex) // NOLINT(whitespace/cast) + EMIT2('U') EMIT2(U_grave) EMIT2(U_acute) + EMIT2(U_diaeresis) EMIT2(U_circumflex) EMIT2(0x168) EMIT2(0x16a) EMIT2(0x16c) EMIT2(0x16e) EMIT2(0x170) EMIT2(0x172) EMIT2(0x1af) EMIT2(0x1d3) @@ -9169,9 +9293,9 @@ static void nfa_emit_equi_class(int c) case 0x1eb5: case 0x1eb7: case 0x2c65: - EMIT2('a') EMIT2(a_grave) EMIT2(a_acute) // NOLINT(whitespace/cast) - EMIT2(a_circumflex) EMIT2(a_virguilla) // NOLINT(whitespace/cast) - EMIT2(a_diaeresis) EMIT2(a_ring) // NOLINT(whitespace/cast) + EMIT2('a') EMIT2(a_grave) EMIT2(a_acute) + EMIT2(a_circumflex) EMIT2(a_virguilla) + EMIT2(a_diaeresis) EMIT2(a_ring) EMIT2(0x101) EMIT2(0x103) EMIT2(0x105) EMIT2(0x1ce) EMIT2(0x1df) EMIT2(0x1e1) EMIT2(0x1fb) EMIT2(0x201) EMIT2(0x203) @@ -9258,8 +9382,8 @@ static void nfa_emit_equi_class(int c) case 0x1ec3: case 0x1ec5: case 0x1ec7: - EMIT2('e') EMIT2(e_grave) EMIT2(e_acute) // NOLINT(whitespace/cast) - EMIT2(e_circumflex) EMIT2(e_diaeresis) // NOLINT(whitespace/cast) + EMIT2('e') EMIT2(e_grave) EMIT2(e_acute) + EMIT2(e_circumflex) EMIT2(e_diaeresis) EMIT2(0x113) EMIT2(0x115) EMIT2(0x117) EMIT2(0x119) EMIT2(0x11b) EMIT2(0x205) EMIT2(0x207) EMIT2(0x229) @@ -9334,8 +9458,8 @@ static void nfa_emit_equi_class(int c) case 0x1e2f: case 0x1ec9: case 0x1ecb: - EMIT2('i') EMIT2(i_grave) EMIT2(i_acute) // NOLINT(whitespace/cast) - EMIT2(i_circumflex) EMIT2(i_diaeresis) // NOLINT(whitespace/cast) + EMIT2('i') EMIT2(i_grave) EMIT2(i_acute) + EMIT2(i_circumflex) EMIT2(i_diaeresis) EMIT2(0x129) EMIT2(0x12b) EMIT2(0x12d) EMIT2(0x12f) EMIT2(0x1d0) EMIT2(0x209) EMIT2(0x20b) EMIT2(0x268) EMIT2(0x1d96) @@ -9451,9 +9575,9 @@ static void nfa_emit_equi_class(int c) case 0x1edf: case 0x1ee1: case 0x1ee3: - EMIT2('o') EMIT2(o_grave) EMIT2(o_acute) // NOLINT(whitespace/cast) - EMIT2(o_circumflex) EMIT2(o_virguilla) // NOLINT(whitespace/cast) - EMIT2(o_diaeresis) EMIT2(o_slash) // NOLINT(whitespace/cast) + EMIT2('o') EMIT2(o_grave) EMIT2(o_acute) + EMIT2(o_circumflex) EMIT2(o_virguilla) + EMIT2(o_diaeresis) EMIT2(o_slash) EMIT2(0x14d) EMIT2(0x14f) EMIT2(0x151) EMIT2(0x1a1) EMIT2(0x1d2) EMIT2(0x1eb) EMIT2(0x1ed) EMIT2(0x1ff) EMIT2(0x20d) @@ -9582,8 +9706,8 @@ static void nfa_emit_equi_class(int c) case 0x1eed: case 0x1eef: case 0x1ef1: - EMIT2('u') EMIT2(u_grave) EMIT2(u_acute) // NOLINT(whitespace/cast) - EMIT2(u_circumflex) EMIT2(u_diaeresis) // NOLINT(whitespace/cast) + EMIT2('u') EMIT2(u_grave) EMIT2(u_acute) + EMIT2(u_circumflex) EMIT2(u_diaeresis) EMIT2(0x169) EMIT2(0x16b) EMIT2(0x16d) EMIT2(0x16f) EMIT2(0x171) EMIT2(0x173) EMIT2(0x1d6) EMIT2(0x1d8) @@ -9636,7 +9760,7 @@ static void nfa_emit_equi_class(int c) case 0x1ef5: case 0x1ef7: case 0x1ef9: - EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis) // NOLINT(whitespace/cast) + EMIT2('y') EMIT2(y_acute) EMIT2(y_diaeresis) EMIT2(0x177) EMIT2(0x1b4) EMIT2(0x233) EMIT2(0x24f) EMIT2(0x1e8f) EMIT2(0x1e99) EMIT2(0x1ef3) EMIT2(0x1ef5) EMIT2(0x1ef7) EMIT2(0x1ef9) @@ -9841,7 +9965,7 @@ static int nfa_regatom(void) emsg(_(e_nopresub)); return FAIL; } - for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) { + for (lp = (uint8_t *)reg_prev_sub; *lp != NUL; lp += utf_ptr2len((char *)lp)) { EMIT(utf_ptr2char((char *)lp)); if (lp != (uint8_t *)reg_prev_sub) { EMIT(NFA_CONCAT); @@ -10348,13 +10472,39 @@ collection: } else { if (got_coll_char == true && startc == 0) { EMIT(0x0a); + EMIT(NFA_CONCAT); } else { EMIT(startc); + if (utf_ptr2len(regparse) == utfc_ptr2len(regparse)) { + EMIT(NFA_CONCAT); + } } - EMIT(NFA_CONCAT); } } + int plen; + if (utf_ptr2len(regparse) != (plen = utfc_ptr2len(regparse))) { + int i = utf_ptr2len(regparse); + + c = utf_ptr2char(regparse + i); + + // Add composing characters + while (true) { + if (c == 0) { + // \x00 is translated to \x0a, start at \x01. + EMIT(1); + } else { + EMIT(c); + } + EMIT(NFA_CONCAT); + if ((i += utf_char2len(c)) >= plen) { + break; + } + c = utf_ptr2char(regparse + i); + } + EMIT(NFA_COMPOSING); + EMIT(NFA_CONCAT); + } MB_PTR_ADV(regparse); } // while (p < endp) @@ -14403,6 +14553,78 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm state = t->state->out; result_if_matched = (t->state->c == NFA_START_COLL); while (true) { + if (state->c == NFA_COMPOSING) { + int mc = curc; + int len = 0; + nfa_state_T *end; + nfa_state_T *sta; + int cchars[MAX_MCO]; + int ccount = 0; + int j; + + sta = t->state->out->out; + if (utf_iscomposing(sta->c)) { + // Only match composing character(s), ignore base + // character. Used for ".{composing}" and "{composing}" + // (no preceding character). + len += utf_char2len(mc); + } + if (rex.reg_icombine && len == 0) { + // If \Z was present, then ignore composing characters. + // When ignoring the base character this always matches. + if (sta->c != curc) { + result = FAIL; + } else { + result = OK; + } + while (sta->c != NFA_END_COMPOSING) { + sta = sta->out; + } + } + // Check base character matches first, unless ignored. + else if (len > 0 || mc == sta->c) { + if (len == 0) { + len += utf_char2len(mc); + sta = sta->out; + } + + // We don't care about the order of composing characters. + // Get them into cchars[] first. + while (len < clen) { + mc = utf_ptr2char((char *)rex.input + len); + cchars[ccount++] = mc; + len += utf_char2len(mc); + if (ccount == MAX_MCO) { + break; + } + } + + // Check that each composing char in the pattern matches a + // composing char in the text. We do not check if all + // composing chars are matched. + result = OK; + while (sta->c != NFA_END_COMPOSING) { + for (j = 0; j < ccount; j++) { + if (cchars[j] == sta->c) { + break; + } + } + if (j == ccount) { + result = FAIL; + break; + } + sta = sta->out; + } + } else { + result = FAIL; + } + + if (t->state->out->out1->c == NFA_END_COMPOSING) { + end = t->state->out->out1; + ADD_STATE_IF_MATCH(end); + } + break; + } if (state->c == NFA_END_COLL) { result = !result_if_matched; break; @@ -15545,6 +15767,9 @@ static regengine_T bt_regengine = { bt_regfree, bt_regexec_nl, bt_regexec_multi, +#ifdef REGEXP_DEBUG + "", +#endif }; static regengine_T nfa_regengine = { @@ -15552,6 +15777,9 @@ static regengine_T nfa_regengine = { nfa_regfree, nfa_regexec_nl, nfa_regexec_multi, +#ifdef REGEXP_DEBUG + "", +#endif }; // Which regexp engine to use? Needed for vim_regcomp(). |