diff options
Diffstat (limited to 'src/nvim/regexp_nfa.c')
| -rw-r--r-- | src/nvim/regexp_nfa.c | 276 |
1 files changed, 141 insertions, 135 deletions
diff --git a/src/nvim/regexp_nfa.c b/src/nvim/regexp_nfa.c index c8b7190b4a..5df5cc5975 100644 --- a/src/nvim/regexp_nfa.c +++ b/src/nvim/regexp_nfa.c @@ -48,14 +48,14 @@ enum { NFA_MATCH, NFA_EMPTY, /* matches 0-length */ - NFA_START_COLL, /* [abc] start */ - NFA_END_COLL, /* [abc] end */ - NFA_START_NEG_COLL, /* [^abc] start */ - NFA_END_NEG_COLL, /* [^abc] end (postfix only) */ - NFA_RANGE, /* range of the two previous items - * (postfix only) */ - NFA_RANGE_MIN, /* low end of a range */ - NFA_RANGE_MAX, /* high end of a range */ + NFA_START_COLL, // [abc] start + NFA_END_COLL, // [abc] end + NFA_START_NEG_COLL, // [^abc] start + NFA_END_NEG_COLL, // [^abc] end (postfix only) + NFA_RANGE, // range of the two previous items + // (postfix only) + NFA_RANGE_MIN, // low end of a range + NFA_RANGE_MAX, // high end of a range NFA_CONCAT, // concatenate two previous items (postfix // only) @@ -88,9 +88,9 @@ enum { NFA_END_INVISIBLE, NFA_END_INVISIBLE_NEG, NFA_END_PATTERN, - NFA_COMPOSING, /* Next nodes in NFA are part of the - composing multibyte char */ - NFA_END_COMPOSING, /* End of a composing char in the NFA */ + NFA_COMPOSING, // Next nodes in NFA are part of the + // composing multibyte char + NFA_END_COMPOSING, // End of a composing char in the NFA NFA_ANY_COMPOSING, // \%C: Any composing characters. NFA_OPT_CHARS, /* \%[abc] */ @@ -256,9 +256,9 @@ static char_u e_ill_char_class[] = N_( "E877: (NFA regexp) Invalid character class: %" PRId64); static char_u e_value_too_large[] = N_("E951: \\% value too large"); -/* Since the out pointers in the list are always - * uninitialized, we use the pointers themselves - * as storage for the Ptrlists. */ +// Since the out pointers in the list are always +// uninitialized, we use the pointers themselves +// as storage for the Ptrlists. typedef union Ptrlist Ptrlist; union Ptrlist { Ptrlist *next; @@ -310,9 +310,9 @@ struct nfa_pim_S { typedef struct { nfa_state_T *state; int count; - nfa_pim_T pim; /* if pim.result != NFA_PIM_UNUSED: postponed - * invisible match */ - regsubs_T subs; /* submatch info, only party used */ + nfa_pim_T pim; // if pim.result != NFA_PIM_UNUSED: postponed + // invisible match + regsubs_T subs; // submatch info, only party used } nfa_thread_T; // nfa_list_T contains the alternative NFA execution states. @@ -543,7 +543,7 @@ static char_u *nfa_get_match_text(nfa_state_T *start) return NULL; /* just in case */ p = p->out; while (p->c > 0) { - len += MB_CHAR2LEN(p->c); + len += utf_char2len(p->c); p = p->out; } if (p->c != NFA_MCLOSE || p->out->c != NFA_MATCH) @@ -1244,11 +1244,11 @@ static int nfa_regatom(void) p = vim_strchr(classchars, no_Magic(c)); if (p == NULL) { if (extra == NFA_ADD_NL) { - EMSGN(_(e_ill_char_class), c); + semsg(_(e_ill_char_class), (int64_t)c); rc_did_emsg = true; return FAIL; } - IEMSGN("INTERNAL: Unknown character class char: %" PRId64, c); + siemsg("INTERNAL: Unknown character class char: %" PRId64, (int64_t)c); return FAIL; } // When '.' is followed by a composing char ignore the dot, so that @@ -1286,7 +1286,7 @@ static int nfa_regatom(void) case Magic('|'): case Magic('&'): case Magic(')'): - EMSGN(_(e_misplaced), no_Magic(c)); // -V1037 + semsg(_(e_misplaced), (int64_t)no_Magic(c)); // -V1037 return FAIL; case Magic('='): @@ -1296,7 +1296,7 @@ static int nfa_regatom(void) case Magic('*'): case Magic('{'): // these should follow an atom, not form an atom - EMSGN(_(e_misplaced), no_Magic(c)); + semsg(_(e_misplaced), (int64_t)no_Magic(c)); return FAIL; case Magic('~'): @@ -1306,13 +1306,14 @@ static int nfa_regatom(void) // Previous substitute pattern. // Generated as "\%(pattern\)". if (reg_prev_sub == NULL) { - EMSG(_(e_nopresub)); + emsg(_(e_nopresub)); return FAIL; } for (lp = reg_prev_sub; *lp != NUL; MB_CPTR_ADV(lp)) { - EMIT(PTR2CHAR(lp)); - if (lp != reg_prev_sub) + EMIT(utf_ptr2char(lp)); + if (lp != reg_prev_sub) { EMIT(NFA_CONCAT); + } } EMIT(NFA_NOPEN); break; @@ -1383,7 +1384,7 @@ static int nfa_regatom(void) re_has_z = REX_SET; break; default: - emsgf(_("E867: (NFA) Unknown operator '\\z%c'"), + semsg(_("E867: (NFA) Unknown operator '\\z%c'"), no_Magic(c)); return FAIL; } @@ -1489,7 +1490,7 @@ static int nfa_regatom(void) while (ascii_isdigit(c)) { if (n > (INT32_MAX - (c - '0')) / 10) { // overflow. - EMSG(_(e_value_too_large)); + emsg(_(e_value_too_large)); return FAIL; } n = n * 10 + (c - '0'); @@ -1516,7 +1517,7 @@ static int nfa_regatom(void) limit = INT32_MAX / MB_MAXBYTES; } if (n >= limit) { - EMSG(_(e_value_too_large)); + emsg(_(e_value_too_large)); return FAIL; } EMIT((int)n); @@ -1529,7 +1530,7 @@ static int nfa_regatom(void) break; } } - emsgf(_("E867: (NFA) Unknown operator '\\%%%c'"), + semsg(_("E867: (NFA) Unknown operator '\\%%%c'"), no_Magic(c)); return FAIL; } @@ -1675,13 +1676,13 @@ collection: } /* Try collating class like [. .] */ if (collclass != 0) { - startc = collclass; /* allow [.a.]-x as a range */ - /* Will emit the proper atom at the end of the - * while loop. */ + startc = collclass; // allow [.a.]-x as a range + // Will emit the proper atom at the end of the + // while loop. } } - /* Try a range like 'a-x' or '\t-z'. Also allows '-' as a - * start character. */ + // Try a range like 'a-x' or '\t-z'. Also allows '-' as a + // start character. if (*regparse == '-' && oldstartc != -1) { emit_range = true; startc = oldstartc; @@ -1689,11 +1690,10 @@ collection: continue; // reading the end of the range } - /* Now handle simple and escaped characters. - * Only "\]", "\^", "\]" and "\\" are special in Vi. Vim - * accepts "\t", "\e", etc., but only when the 'l' flag in - * 'cpoptions' is not included. - */ + // Now handle simple and escaped characters. + // Only "\]", "\^", "\]" and "\\" are special in Vi. Vim + // accepts "\t", "\e", etc., but only when the 'l' flag in + // 'cpoptions' is not included. if (*regparse == '\\' && regparse + 1 <= endp && (vim_strchr(REGEXP_INRANGE, regparse[1]) != NULL @@ -1723,9 +1723,10 @@ collection: } } - /* Normal printable char */ - if (startc == -1) - startc = PTR2CHAR(regparse); + // Normal printable char + if (startc == -1) { + startc = utf_ptr2char(regparse); + } /* Previous char was '-', so this char is end of range. */ if (emit_range) { @@ -1736,18 +1737,19 @@ collection: } if (endc > startc + 2) { - /* Emit a range instead of the sequence of - * individual characters. */ - if (startc == 0) - /* \x00 is translated to \x0a, start at \x01. */ + // Emit a range instead of the sequence of + // individual characters. + if (startc == 0) { + // \x00 is translated to \x0a, start at \x01. EMIT(1); - else - --post_ptr; /* remove NFA_CONCAT */ + } else { + post_ptr--; // remove NFA_CONCAT + } EMIT(endc); EMIT(NFA_RANGE); EMIT(NFA_CONCAT); - } else if ((*mb_char2len)(startc) > 1 - || (*mb_char2len)(endc) > 1) { + } else if (utf_char2len(startc) > 1 + || utf_char2len(endc) > 1) { // Emit the characters in the range. // "startc" was already emitted, so skip it. for (c = startc + 1; c <= endc; c++) { @@ -1755,8 +1757,8 @@ collection: EMIT(NFA_CONCAT); } } else { - /* Emit the range. "startc" was already emitted, so - * skip it. */ + // Emit the range. "startc" was already emitted, so + // skip it. for (c = startc + 1; c <= endc; c++) { EMIT(c); EMIT(NFA_CONCAT); @@ -1765,19 +1767,20 @@ collection: emit_range = false; startc = -1; } else { - /* This char (startc) is not part of a range. Just - * emit it. - * Normally, simply emit startc. But if we get char - * code=0 from a collating char, then replace it with - * 0x0a. - * This is needed to completely mimic the behaviour of - * the backtracking engine. */ + // This char (startc) is not part of a range. Just + // emit it. + // Normally, simply emit startc. But if we get char + // code=0 from a collating char, then replace it with + // 0x0a. + // This is needed to completely mimic the behaviour of + // the backtracking engine. if (startc == NFA_NEWL) { - /* Line break can't be matched as part of the - * collection, add an OR below. But not for negated - * range. */ - if (!negated) + // Line break can't be matched as part of the + // collection, add an OR below. But not for negated + // range. + if (!negated) { extra = NFA_ADD_NL; + } } else { if (got_coll_char == true && startc == 0) { EMIT(0x0a); @@ -1827,18 +1830,18 @@ collection: nfa_do_multibyte: // plen is length of current char with composing chars - if ((*mb_char2len)(c) != (plen = utfc_ptr2len(old_regparse)) + if (utf_char2len(c) != (plen = utfc_ptr2len(old_regparse)) || utf_iscomposing(c)) { int i = 0; - /* A base character plus composing characters, or just one - * or more composing characters. - * This requires creating a separate atom as if enclosing - * the characters in (), where NFA_COMPOSING is the ( and - * NFA_END_COMPOSING is the ). Note that right now we are - * building the postfix form, not the NFA itself; - * a composing char could be: a, b, c, NFA_COMPOSING - * where 'b' and 'c' are chars with codes > 256. */ + // A base character plus composing characters, or just one + // or more composing characters. + // This requires creating a separate atom as if enclosing + // the characters in (), where NFA_COMPOSING is the ( and + // NFA_END_COMPOSING is the ). Note that right now we are + // building the postfix form, not the NFA itself; + // a composing char could be: a, b, c, NFA_COMPOSING + // where 'b' and 'c' are chars with codes > 256. */ for (;; ) { EMIT(c); if (i > 0) @@ -1954,7 +1957,7 @@ static int nfa_regpiece(void) break; } if (i == 0) { - emsgf(_("E869: (NFA) Unknown operator '\\@%c'"), op); + semsg(_("E869: (NFA) Unknown operator '\\@%c'"), op); return FAIL; } EMIT(i); @@ -2751,7 +2754,7 @@ static void st_error(int *postfix, int *end, int *p) fclose(df); } #endif - EMSG(_("E874: (NFA) Could not pop the stack!")); + emsg(_("E874: (NFA) Could not pop the stack!")); } /* @@ -2971,8 +2974,8 @@ static int nfa_max_width(nfa_state_T *startstate, int depth) if (state->c < 0) /* don't know what this is */ return -1; - /* normal character */ - len += MB_CHAR2LEN(state->c); + // normal character + len += utf_char2len(state->c); break; } @@ -3109,9 +3112,9 @@ static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size) case NFA_END_COLL: case NFA_END_NEG_COLL: - /* On the stack is the sequence starting with NFA_START_COLL or - * NFA_START_NEG_COLL and all possible characters. Patch it to - * add the output to the start. */ + // On the stack is the sequence starting with NFA_START_COLL or + // NFA_START_NEG_COLL and all possible characters. Patch it to + // add the output to the start. if (nfa_calc_size == true) { nstate++; break; @@ -3233,12 +3236,12 @@ static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size) if (before) n = *++p; /* get the count */ - /* The \@= operator: match the preceding atom with zero width. - * The \@! operator: no match for the preceding atom. - * The \@<= operator: match for the preceding atom. - * The \@<! operator: no match for the preceding atom. - * Surrounds the preceding atom with START_INVISIBLE and - * END_INVISIBLE, similarly to MOPEN. */ + // The \@= operator: match the preceding atom with zero width. + // The \@! operator: no match for the preceding atom. + // The \@<= operator: match for the preceding atom. + // The \@<! operator: no match for the preceding atom. + // Surrounds the preceding atom with START_INVISIBLE and + // END_INVISIBLE, similarly to MOPEN. if (nfa_calc_size == true) { nstate += pattern ? 4 : 2; @@ -3269,11 +3272,12 @@ static nfa_state_T *post2nfa(int *postfix, int *end, int nfa_calc_size) patch(e.out, s1); PUSH(frag(s, list1(&s1->out))); if (before) { - if (n <= 0) - /* See if we can guess the maximum width, it avoids a - * lot of pointless tries. */ + if (n <= 0) { + // See if we can guess the maximum width, it avoids a + // lot of pointless tries. n = nfa_max_width(e.start, 0); - s->val = n; /* store the count */ + } + s->val = n; // store the count } } break; @@ -3516,8 +3520,8 @@ static void nfa_postprocess(nfa_regprog_T *prog) directly = ch_follows * 10 < ch_invisible; } } else { - /* normal invisible, first do the one with the - * highest failure chance */ + // normal invisible, first do the one with the + // highest failure chance directly = ch_follows < ch_invisible; } } @@ -4012,8 +4016,8 @@ static regsubs_T *addstate( case NFA_ZEND: case NFA_SPLIT: case NFA_EMPTY: - /* These nodes are not added themselves but their "out" and/or - * "out1" may be added below. */ + // These nodes are not added themselves but their "out" and/or + // "out1" may be added below. break; case NFA_BOL: @@ -4051,21 +4055,20 @@ static regsubs_T *addstate( case NFA_ZOPEN9: case NFA_NOPEN: case NFA_ZSTART: - /* These nodes need to be added so that we can bail out when it - * was added to this list before at the same position to avoid an - * endless loop for "\(\)*" */ + // These nodes need to be added so that we can bail out when it + // was added to this list before at the same position to avoid an + // endless loop for "\(\)*" default: if (state->lastlist[nfa_ll_index] == l->id && state->c != NFA_SKIP) { - /* This state is already in the list, don't add it again, - * unless it is an MOPEN that is used for a backreference or - * when there is a PIM. For NFA_MATCH check the position, - * lower position is preferred. */ + // This state is already in the list, don't add it again, + // unless it is an MOPEN that is used for a backreference or + // when there is a PIM. For NFA_MATCH check the position, + // lower position is preferred. if (!rex.nfa_has_backref && pim == NULL && !l->has_pim && state->c != NFA_MATCH) { - - /* When called from addstate_here() do insert before - * existing states. */ + // When called from addstate_here() do insert before + // existing states. if (add_here) { for (k = 0; k < l->n && k < listindex; ++k) { if (l->t[k].state->id == state->id) { @@ -4088,10 +4091,11 @@ skip_add: } } - /* Do not add the state again when it exists with the same - * positions. */ - if (has_state_with_pos(l, state, subs, pim)) + // Do not add the state again when it exists with the same + // positions. + if (has_state_with_pos(l, state, subs, pim)) { goto skip_add; + } } // When there are backreferences or PIMs the number of states may @@ -4101,7 +4105,7 @@ skip_add: const size_t newsize = newlen * sizeof(nfa_thread_T); if ((long)(newsize >> 10) >= p_mmp) { - EMSG(_(e_maxmempat)); + emsg(_(e_maxmempat)); depth--; return NULL; } @@ -4362,9 +4366,9 @@ static regsubs_T *addstate_here( int count; int listidx = *ip; - /* First add the state(s) at the end, so that we know how many there are. - * Pass the listidx as offset (avoids adding another argument to - * addstate(). */ + // First add the state(s) at the end, so that we know how many there are. + // Pass the listidx as offset (avoids adding another argument to + // addstate(). regsubs_T *r = addstate(l, state, subs, pim, -listidx - ADDSTATE_HERE_OFFSET); if (r == NULL) { return NULL; @@ -4385,13 +4389,13 @@ static regsubs_T *addstate_here( l->t[listidx] = l->t[l->n - 1]; } else if (count > 1) { if (l->n + count - 1 >= l->len) { - /* not enough space to move the new states, reallocate the list - * and move the states to the right position */ + // not enough space to move the new states, reallocate the list + // and move the states to the right position const int newlen = l->len * 3 / 2 + 50; const size_t newsize = newlen * sizeof(nfa_thread_T); if ((long)(newsize >> 10) >= p_mmp) { - EMSG(_(e_maxmempat)); + emsg(_(e_maxmempat)); return NULL; } nfa_thread_T *const newl = xmalloc(newsize); @@ -4408,8 +4412,8 @@ static regsubs_T *addstate_here( xfree(l->t); l->t = newl; } else { - /* make space for new states, then move them from the - * end to the current position */ + // make space for new states, then move them from the + // end to the current position memmove(&(l->t[listidx + count]), &(l->t[listidx + 1]), sizeof(nfa_thread_T) * (l->n - listidx - 1)); @@ -4520,7 +4524,7 @@ static int check_char_class(int class, int c) default: // should not be here :P - IEMSGN(_(e_ill_char_class), class); + siemsg(_(e_ill_char_class), (int64_t)class); return FAIL; } return FAIL; @@ -4797,7 +4801,7 @@ static int recursive_regmatch( fprintf(log_fd, "MATCH = %s\n", !result ? "false" : "OK"); fprintf(log_fd, "****************************\n"); } else { - EMSG(_(e_log_open_failed)); + emsg(_(e_log_open_failed)); log_fd = stderr; } #endif @@ -4994,9 +4998,9 @@ static long find_match_text(colnr_T startcol, int regstart, char_u *match_text) char_u *s2 = rex.line + col + regstart_len; // skip regstart while (*s1) { int c1_len = PTR2LEN(s1); - int c1 = PTR2CHAR(s1); + int c1 = utf_ptr2char(s1); int c2_len = PTR2LEN(s2); - int c2 = PTR2CHAR(s2); + int c2 = utf_ptr2char(s2); if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) || c1_len != c2_len) { @@ -5008,7 +5012,7 @@ static long find_match_text(colnr_T startcol, int regstart, char_u *match_text) } if (match // check that no composing char follows - && !utf_iscomposing(PTR2CHAR(s2))) { + && !utf_iscomposing(utf_ptr2char(s2))) { cleanup_subexpr(); if (REG_MULTI) { rex.reg_startpos[0].lnum = rex.lnum; @@ -5079,7 +5083,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, FILE *debug = fopen(NFA_REGEXP_DEBUG_LOG, "a"); if (debug == NULL) { - EMSG2("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG); + semsg("(NFA) COULD NOT OPEN %s!", NFA_REGEXP_DEBUG_LOG); return false; } #endif @@ -5117,7 +5121,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, abs(start->id), code); fprintf(log_fd, "**********************************\n"); } else { - EMSG(_(e_log_open_failed)); + emsg(_(e_log_open_failed)); log_fd = stderr; } #endif @@ -5618,7 +5622,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, // Only match composing character(s), ignore base // character. Used for ".{composing}" and "{composing}" // (no preceding character). - len += mb_char2len(mc); + len += utf_char2len(mc); } if (rex.reg_icombine && len == 0) { // If \Z was present, then ignore composing characters. @@ -5634,7 +5638,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, } else if (len > 0 || mc == sta->c) { // Check base character matches first, unless ignored. if (len == 0) { - len += mb_char2len(mc); + len += utf_char2len(mc); sta = sta->out; } @@ -5643,9 +5647,10 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, while (len < clen) { mc = utf_ptr2char(rex.input + len); cchars[ccount++] = mc; - len += mb_char2len(mc); - if (ccount == MAX_MCO) + len += utf_char2len(mc); + if (ccount == MAX_MCO) { break; + } } // Check that each composing char in the pattern matches a @@ -5805,12 +5810,12 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, break; case NFA_PRINT: // \p - result = vim_isprintc(PTR2CHAR(rex.input)); + result = vim_isprintc(utf_ptr2char(rex.input)); ADD_STATE_IF_MATCH(t->state); break; case NFA_SPRINT: // \P - result = !ascii_isdigit(curc) && vim_isprintc(PTR2CHAR(rex.input)); + result = !ascii_isdigit(curc) && vim_isprintc(utf_ptr2char(rex.input)); ADD_STATE_IF_MATCH(t->state); break; @@ -6143,7 +6148,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, #ifdef REGEXP_DEBUG if (c < 0) { - IEMSGN("INTERNAL: Negative state char: %" PRId64, c); + siemsg("INTERNAL: Negative state char: %" PRId64, (int64_t)c); } #endif result = (c == curc); @@ -6303,7 +6308,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, } else { // Checking if the required start character matches is // cheaper than adding a state that won't match. - const int c = PTR2CHAR(rex.input + clen); + const int c = utf_ptr2char(rex.input + clen); if (c != prog->regstart && (!rex.reg_ic || utf_fold(c) != utf_fold(prog->regstart))) { @@ -6425,7 +6430,7 @@ static long nfa_regtry(nfa_regprog_T *prog, fprintf(f, "\n\n"); fclose(f); } else { - EMSG("Could not open temporary log file for writing"); + emsg("Could not open temporary log file for writing"); } #endif @@ -6540,7 +6545,7 @@ static long nfa_regexec_both(char_u *line, colnr_T startcol, /* Be paranoid... */ if (prog == NULL || line == NULL) { - IEMSG(_(e_null)); + iemsg(_(e_null)); goto theend; } @@ -6582,10 +6587,11 @@ static long nfa_regexec_both(char_u *line, colnr_T startcol, } if (prog->regstart != NUL) { - /* Skip ahead until a character we know the match must start with. - * When there is none there is no match. */ - if (skip_to_start(prog->regstart, &col) == FAIL) + // Skip ahead until a character we know the match must start with. + // When there is none there is no match. + if (skip_to_start(prog->regstart, &col) == FAIL) { return 0L; + } // If match_text is set it contains the full text that must match. // Nothing else to try. Doesn't handle combining chars well. |
