From 6364fc617ded29100c1aa3103e189fd983dd5e64 Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Thu, 28 Mar 2024 18:15:41 +0800 Subject: vim-patch:9.1.0217: regexp: verymagic cannot match before/after a mark (#28074) Problem: regexp: verymagic cannot match before/after a mark Solution: Correctly check for the very magic check (Julio B) Fix regexp parser for \v%>'m and \v%<'m Currently \v%'m works fine, but it is unable to match before or after the position of mark m. closes: vim/vim#14309 https://github.com/vim/vim/commit/46fa3c7e271eb2abb05a0d9e6dbc9c36c2b2da02 Co-authored-by: Julio B --- src/nvim/regexp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 86082adbb6..08c804bca5 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -4494,7 +4494,7 @@ static uint8_t *regatom(int *flagp) n = n * 10 + (uint32_t)(c - '0'); c = getchr(); } - if (c == '\'' && n == 0) { + if (no_Magic(c) == '\'' && n == 0) { // "\%'m", "\%<'m" and "\%>'m": Mark c = getchr(); ret = regnode(RE_MARK); @@ -10218,7 +10218,7 @@ static int nfa_regatom(void) } EMIT((int)n); break; - } else if (c == '\'' && n == 0) { + } else if (no_Magic(c) == '\'' && n == 0) { // \%'m \%<'m \%>'m EMIT(cmp == '<' ? NFA_MARK_LT : cmp == '>' ? NFA_MARK_GT : NFA_MARK); -- cgit From f49408454ddb48016d51b48bcd9d5dab538f5cc7 Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Wed, 10 Apr 2024 07:08:49 +0800 Subject: vim-patch:9.1.0296: regexp: engines do not handle case-folding well (#28259) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: Regex engines do not handle case-folding well Solution: Correctly calculate byte length of characters to skip When the regexp engine compares two utf-8 codepoints case insensitively it may match an adjacent character, because it assumes it can step over as many bytes as the pattern contains. This however is not necessarily true because of case-folding, a multi-byte UTF-8 character can be considered equal to some single-byte value. Let's consider the pattern 'ſ' and the string 's'. When comparing and ignoring case, the single character 's' matches, and since it matches Vim will try to step over the match (by the amount of bytes of the pattern), assuming that since it matches, the length of both strings is the same. However in that case, it should only step over the single byte value 's' so by 1 byte and try to start matching after it again. So for the backtracking engine we need to ensure: - we try to match the correct length for the pattern and the text - in case of a match, we step over it correctly The same thing can happen for the NFA engine, when skipping to the next character to test for a match. We are skipping over the regstart pointer, however we do not consider the case that because of case-folding we may need to adjust the number of bytes to skip over. So this needs to be adjusted in find_match_text() as well. A related issue turned out, when prog->match_text is actually empty. In that case we should try to find the next match and skip this condition. fixes: vim/vim#14294 closes: vim/vim#14433 https://github.com/vim/vim/commit/7a27c108e0509f3255ebdcb6558e896c223e4d23 Co-authored-by: Christian Brabandt --- src/nvim/regexp.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 08c804bca5..32fb086ca6 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -1627,7 +1627,9 @@ static void mb_decompose(int c, int *c1, int *c2, int *c3) /// Compare two strings, ignore case if rex.reg_ic set. /// Return 0 if strings match, non-zero otherwise. -/// Correct the length "*n" when composing characters are ignored. +/// Correct the length "*n" when composing characters are ignored +/// or for utf8 when both utf codepoints are considered equal because of +/// case-folding but have different length (e.g. 's' and 'ſ') static int cstrncmp(char *s1, char *s2, int *n) { int result; @@ -1635,8 +1637,11 @@ static int cstrncmp(char *s1, char *s2, int *n) if (!rex.reg_ic) { result = strncmp(s1, s2, (size_t)(*n)); } else { - assert(*n >= 0); - result = mb_strnicmp(s1, s2, (size_t)(*n)); + int l2 = utfc_ptr2len(s2); + result = utf_strnicmp(s1, s2, (size_t)(*n), (size_t)l2); + if (result == 0 && l2 < *n) { + *n = l2; + } } // if it failed and it's utf8 and we want to combineignore: @@ -6490,11 +6495,9 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) } } } else { - for (i = 0; i < len; i++) { - if (opnd[i] != rex.input[i]) { - status = RA_NOMATCH; - break; - } + if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) { + status = RA_NOMATCH; + break; } } rex.input += len; @@ -13845,23 +13848,26 @@ static int skip_to_start(int c, colnr_T *colp) // Returns zero for no match, 1 for a match. static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) { -#define PTR2LEN(x) utf_ptr2len(x) - colnr_T col = *startcol; - int regstart_len = PTR2LEN((char *)rex.line + col); + const int regstart_len = utf_char2len(regstart); while (true) { bool match = true; uint8_t *s1 = match_text; - uint8_t *s2 = rex.line + col + regstart_len; // skip regstart + // skip regstart + uint8_t *s2 = rex.line + col + regstart_len; + if (regstart_len > 1 + && utf_char2len(utf_ptr2char((char *)rex.line + col)) != regstart_len) { + // because of case-folding of the previously matched text, we may need + // to skip fewer bytes than utf_char2len(regstart) + s2 = rex.line + col + utf_char2len(utf_fold(regstart)); + } while (*s1) { - int c1_len = PTR2LEN((char *)s1); + int c1_len = utf_ptr2len((char *)s1); int c1 = utf_ptr2char((char *)s1); - int c2_len = PTR2LEN((char *)s2); + int c2_len = utf_ptr2len((char *)s2); int c2 = utf_ptr2char((char *)s2); - - if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) - || c1_len != c2_len) { + if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { match = false; break; } @@ -13894,8 +13900,6 @@ static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) *startcol = col; return 0L; - -#undef PTR2LEN } static int nfa_did_time_out(void) @@ -15527,7 +15531,7 @@ static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int // If match_text is set it contains the full text that must match. // Nothing else to try. Doesn't handle combining chars well. - if (prog->match_text != NULL && !rex.reg_icombine) { + if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) { retval = find_match_text(&col, prog->regstart, prog->match_text); if (REG_MULTI) { rex.reg_mmatch->rmm_matchcol = col; -- cgit From d0afb2dc4eb8e70942441b3c9a551dcccd6806cd Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Thu, 11 Apr 2024 07:40:16 +0800 Subject: vim-patch:9.1.0297: Patch 9.1.0296 causes too many issues (#28263) Problem: Patch 9.1.0296 causes too many issues (Tony Mechelynck, chdiza, CI) Solution: Back out the change for now Revert "patch 9.1.0296: regexp: engines do not handle case-folding well" This reverts commit 7a27c108e0509f3255ebdcb6558e896c223e4d23 it causes issues with syntax highlighting and breaks the FreeBSD and MacOS CI. It needs more work. fixes: vim/vim#14487 https://github.com/vim/vim/commit/c97f4d61cde24030f2f7d2318e1b409a0ccc3e43 Co-authored-by: Christian Brabandt --- src/nvim/regexp.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 32fb086ca6..a81990670a 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -1627,9 +1627,7 @@ static void mb_decompose(int c, int *c1, int *c2, int *c3) /// Compare two strings, ignore case if rex.reg_ic set. /// Return 0 if strings match, non-zero otherwise. -/// Correct the length "*n" when composing characters are ignored -/// or for utf8 when both utf codepoints are considered equal because of -/// case-folding but have different length (e.g. 's' and 'ſ') +/// Correct the length "*n" when composing characters are ignored. static int cstrncmp(char *s1, char *s2, int *n) { int result; @@ -1637,11 +1635,8 @@ static int cstrncmp(char *s1, char *s2, int *n) if (!rex.reg_ic) { result = strncmp(s1, s2, (size_t)(*n)); } else { - int l2 = utfc_ptr2len(s2); - result = utf_strnicmp(s1, s2, (size_t)(*n), (size_t)l2); - if (result == 0 && l2 < *n) { - *n = l2; - } + assert(*n >= 0); + result = mb_strnicmp(s1, s2, (size_t)(*n)); } // if it failed and it's utf8 and we want to combineignore: @@ -6495,9 +6490,11 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) } } } else { - if (cstrncmp((char *)opnd, (char *)rex.input, &len) != 0) { - status = RA_NOMATCH; - break; + for (i = 0; i < len; i++) { + if (opnd[i] != rex.input[i]) { + status = RA_NOMATCH; + break; + } } } rex.input += len; @@ -13849,25 +13846,19 @@ static int skip_to_start(int c, colnr_T *colp) static int find_match_text(colnr_T *startcol, int regstart, uint8_t *match_text) { colnr_T col = *startcol; - const int regstart_len = utf_char2len(regstart); + const int regstart_len = utf_ptr2len((char *)rex.line + col); while (true) { bool match = true; uint8_t *s1 = match_text; - // skip regstart - uint8_t *s2 = rex.line + col + regstart_len; - if (regstart_len > 1 - && utf_char2len(utf_ptr2char((char *)rex.line + col)) != regstart_len) { - // because of case-folding of the previously matched text, we may need - // to skip fewer bytes than utf_char2len(regstart) - s2 = rex.line + col + utf_char2len(utf_fold(regstart)); - } + uint8_t *s2 = rex.line + col + regstart_len; // skip regstart while (*s1) { int c1_len = utf_ptr2len((char *)s1); int c1 = utf_ptr2char((char *)s1); int c2_len = utf_ptr2len((char *)s2); int c2 = utf_ptr2char((char *)s2); - if (c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) { + if ((c1 != c2 && (!rex.reg_ic || utf_fold(c1) != utf_fold(c2))) + || c1_len != c2_len) { match = false; break; } @@ -15531,7 +15522,7 @@ static int nfa_regexec_both(uint8_t *line, colnr_T startcol, proftime_T *tm, int // If match_text is set it contains the full text that must match. // Nothing else to try. Doesn't handle combining chars well. - if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine) { + if (prog->match_text != NULL && !rex.reg_icombine) { retval = find_match_text(&col, prog->regstart, prog->match_text); if (REG_MULTI) { rex.reg_mmatch->rmm_matchcol = col; -- cgit From f6a3fdd6848d67dc54cebb6c297f8ebdc109c3a3 Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Sun, 14 Apr 2024 21:48:32 +0800 Subject: refactor: fix clang NonNullParamChecker warnings (#28327) --- src/nvim/regexp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index a81990670a..0ce911c91a 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -5320,7 +5320,7 @@ static regprog_T *bt_regcomp(uint8_t *expr, int re_flags) } // Remember whether this pattern has any \z specials in it. r->reghasz = (uint8_t)re_has_z; - scan = r->program + 1; // First BRANCH. + scan = &r->program[1]; // First BRANCH. if (OP(regnext(scan)) == END) { // Only one top-level choice. scan = OPERAND(scan); @@ -7322,7 +7322,7 @@ static int regtry(bt_regprog_T *prog, colnr_T col, proftime_T *tm, int *timed_ou // Clear the external match subpointers if necessaey. rex.need_clear_zsubexpr = (prog->reghasz == REX_SET); - if (regmatch(prog->program + 1, tm, timed_out) == 0) { + if (regmatch(&prog->program[1], tm, timed_out) == 0) { return 0; } @@ -7664,7 +7664,7 @@ static void regdump(uint8_t *pattern, bt_regprog_T *r) fprintf(f, "-------------------------------------\n\r\nregcomp(%s):\r\n", pattern); - s = r->program + 1; + s = &r->program[1]; // Loop until we find the END that isn't before a referred next (an END // can also appear in a NOMATCH operand). while (op != END || s <= end) { -- cgit From 0ea38c9a53dfcff17703ea22f701ed1cc5bbd7d3 Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Sat, 20 Apr 2024 19:31:00 +0800 Subject: refactor: add xmemcpyz() and use it in place of some xstrlcpy() (#28422) Problem: Using xstrlcpy() when the exact length of the string to be copied is known is not ideal because it requires adding 1 to the length and an unnecessary strlen(). Solution: Add xmemcpyz() and use it in place of such xstrlcpy() calls. --- src/nvim/regexp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 0ce911c91a..e923449abe 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -2379,7 +2379,7 @@ char *reg_submatch(int no) // Within one line: take form start to end col. len = rsm.sm_mmatch->endpos[no].col - rsm.sm_mmatch->startpos[no].col; if (round == 2) { - xstrlcpy(retval, s, (size_t)len + 1); + xmemcpyz(retval, s, (size_t)len); } len++; } else { -- cgit From 7acf39ddab8ebdb63ebf78ec980149d20783fd4b Mon Sep 17 00:00:00 2001 From: dundargoc <33953936+dundargoc@users.noreply.github.com> Date: Wed, 15 May 2024 01:18:33 +0200 Subject: docs: misc (#28609) Closes https://github.com/neovim/neovim/issues/28484. Closes https://github.com/neovim/neovim/issues/28719. Co-authored-by: Chris Co-authored-by: Gregory Anders Co-authored-by: Jake B <16889000+jakethedev@users.noreply.github.com> Co-authored-by: Jonathan Raines Co-authored-by: Yi Ming Co-authored-by: Zane Dufour Co-authored-by: zeertzjq --- src/nvim/regexp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index e923449abe..77724b629d 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -10555,7 +10555,7 @@ nfa_do_multibyte: // NFA_END_COMPOSING is the ). Note that right now we are // building the postfix form, not the NFA itself; // a composing char could be: a, b, c, NFA_COMPOSING - // where 'b' and 'c' are chars with codes > 256. */ + // where 'b' and 'c' are chars with codes > 256. while (true) { EMIT(c); if (i > 0) { -- cgit From 0e187fe038c76e822353e7fb0fd96e860e5ab3ef Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Mon, 20 May 2024 12:42:57 +0800 Subject: vim-patch:9.1.0409: too many strlen() calls in the regexp engine (#28857) Problem: too many strlen() calls in the regexp engine Solution: refactor code to retrieve strlen differently, make use of bsearch() for getting the character class (John Marriott) closes: vim/vim#14648 https://github.com/vim/vim/commit/82792db6315f7c7b0e299cdde1566f2932a463f8 Cherry-pick keyvalue_T and its comparison functions from patch 9.1.0256. vim-patch:9.1.0410: warning about uninitialized variable vim-patch:9.1.0412: typo in regexp_bt.c in DEBUG code Co-authored-by: John Marriott --- src/nvim/regexp.c | 405 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 266 insertions(+), 139 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 77724b629d..1027ce05b6 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -45,6 +45,12 @@ #include "nvim/types_defs.h" #include "nvim/vim_defs.h" +typedef enum { + RGLF_LINE = 0x01, + RGLF_LENGTH = 0x02, + RGLF_SUBMATCH = 0x04, +} reg_getline_flags_T; + enum { /// In the NFA engine: how many braces are allowed. /// TODO(RE): Use dynamic memory allocation instead of static, like here @@ -386,6 +392,7 @@ static int re_multi_type(int c) } static char *reg_prev_sub = NULL; +static size_t reg_prev_sublen = 0; // REGEXP_INRANGE contains all characters which are always special in a [] // range after '\'. @@ -420,60 +427,82 @@ static int backslash_trans(int c) return c; } +enum { + CLASS_ALNUM = 0, + CLASS_ALPHA, + CLASS_BLANK, + CLASS_CNTRL, + CLASS_DIGIT, + CLASS_GRAPH, + CLASS_LOWER, + CLASS_PRINT, + CLASS_PUNCT, + CLASS_SPACE, + CLASS_UPPER, + CLASS_XDIGIT, + CLASS_TAB, + CLASS_RETURN, + CLASS_BACKSPACE, + CLASS_ESCAPE, + CLASS_IDENT, + CLASS_KEYWORD, + CLASS_FNAME, + CLASS_NONE = 99, +}; + /// Check for a character class name "[:name:]". "pp" points to the '['. /// Returns one of the CLASS_ items. CLASS_NONE means that no item was /// recognized. Otherwise "pp" is advanced to after the item. static int get_char_class(char **pp) { - static const char *(class_names[]) = { - "alnum:]", -#define CLASS_ALNUM 0 - "alpha:]", -#define CLASS_ALPHA 1 - "blank:]", -#define CLASS_BLANK 2 - "cntrl:]", -#define CLASS_CNTRL 3 - "digit:]", -#define CLASS_DIGIT 4 - "graph:]", -#define CLASS_GRAPH 5 - "lower:]", -#define CLASS_LOWER 6 - "print:]", -#define CLASS_PRINT 7 - "punct:]", -#define CLASS_PUNCT 8 - "space:]", -#define CLASS_SPACE 9 - "upper:]", -#define CLASS_UPPER 10 - "xdigit:]", -#define CLASS_XDIGIT 11 - "tab:]", -#define CLASS_TAB 12 - "return:]", -#define CLASS_RETURN 13 - "backspace:]", -#define CLASS_BACKSPACE 14 - "escape:]", -#define CLASS_ESCAPE 15 - "ident:]", -#define CLASS_IDENT 16 - "keyword:]", -#define CLASS_KEYWORD 17 - "fname:]", -#define CLASS_FNAME 18 + // must be sorted by the 'value' field because it is used by bsearch()! + static keyvalue_T char_class_tab[] = { + KEYVALUE_ENTRY(CLASS_ALNUM, "alnum:]"), + KEYVALUE_ENTRY(CLASS_ALPHA, "alpha:]"), + KEYVALUE_ENTRY(CLASS_BACKSPACE, "backspace:]"), + KEYVALUE_ENTRY(CLASS_BLANK, "blank:]"), + KEYVALUE_ENTRY(CLASS_CNTRL, "cntrl:]"), + KEYVALUE_ENTRY(CLASS_DIGIT, "digit:]"), + KEYVALUE_ENTRY(CLASS_ESCAPE, "escape:]"), + KEYVALUE_ENTRY(CLASS_FNAME, "fname:]"), + KEYVALUE_ENTRY(CLASS_GRAPH, "graph:]"), + KEYVALUE_ENTRY(CLASS_IDENT, "ident:]"), + KEYVALUE_ENTRY(CLASS_KEYWORD, "keyword:]"), + KEYVALUE_ENTRY(CLASS_LOWER, "lower:]"), + KEYVALUE_ENTRY(CLASS_PRINT, "print:]"), + KEYVALUE_ENTRY(CLASS_PUNCT, "punct:]"), + KEYVALUE_ENTRY(CLASS_RETURN, "return:]"), + KEYVALUE_ENTRY(CLASS_SPACE, "space:]"), + KEYVALUE_ENTRY(CLASS_TAB, "tab:]"), + KEYVALUE_ENTRY(CLASS_UPPER, "upper:]"), + KEYVALUE_ENTRY(CLASS_XDIGIT, "xdigit:]") }; -#define CLASS_NONE 99 - int i; - if ((*pp)[1] == ':') { - for (i = 0; i < (int)ARRAY_SIZE(class_names); i++) { - if (strncmp(*pp + 2, class_names[i], strlen(class_names[i])) == 0) { - *pp += strlen(class_names[i]) + 2; - return i; - } + // check that the value of "pp" has a chance of matching + if ((*pp)[1] == ':' && ASCII_ISLOWER((*pp)[2]) + && ASCII_ISLOWER((*pp)[3]) && ASCII_ISLOWER((*pp)[4])) { + // this function can be called repeatedly with the same value for "pp" + // so we cache the last found entry. + static keyvalue_T *last_entry = NULL; + + keyvalue_T target = { + .key = 0, + .value = *pp + 2, + .length = 0, // not used, see cmp_keyvalue_value_n() + }; + + keyvalue_T *entry; + if (last_entry != NULL && cmp_keyvalue_value_n(&target, last_entry) == 0) { + entry = last_entry; + } else { + entry = (keyvalue_T *)bsearch(&target, &char_class_tab, + ARRAY_SIZE(char_class_tab), + sizeof(char_class_tab[0]), cmp_keyvalue_value_n); + } + if (entry != NULL) { + last_entry = entry; + *pp += entry->length + 2; + return entry->key; } } return CLASS_NONE; @@ -764,16 +793,18 @@ char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *droppe break; } } else if (p[0] == '\\' && p[1] != NUL) { + size_t startplen = 0; if (dirc == '?' && newp != NULL && p[1] == '?') { // change "\?" to "?", make a copy first. if (*newp == NULL) { - *newp = xstrdup(startp); + startplen = strlen(startp); + *newp = xstrnsave(startp, startplen); p = *newp + (p - startp); } if (dropped != NULL) { (*dropped)++; } - STRMOVE(p, p + 1); + memmove(p, p + 1, (startplen - (size_t)((p + 1) - *newp)) + 1); } else { p++; // skip next character } @@ -1264,19 +1295,89 @@ static bool reg_iswordc(int c) return vim_iswordc_buf(c, rex.reg_buf); } -// Get pointer to the line "lnum", which is relative to "reg_firstlnum". -static char *reg_getline(linenr_T lnum) -{ - // when looking behind for a match/no-match lnum is negative. But we - // can't go before line 1 - if (rex.reg_firstlnum + lnum < 1) { - return NULL; +static bool can_f_submatch = false; ///< true when submatch() can be used + +/// These pointers are used for reg_submatch(). Needed for when the +/// substitution string is an expression that contains a call to substitute() +/// and submatch(). +typedef struct { + regmatch_T *sm_match; + regmmatch_T *sm_mmatch; + linenr_T sm_firstlnum; + linenr_T sm_maxline; + int sm_line_lbr; +} regsubmatch_T; + +static regsubmatch_T rsm; ///< can only be used when can_f_submatch is true + +/// Common code for reg_getline(), reg_getline_len(), reg_getline_submatch() and +/// reg_getline_submatch_len(). +/// +/// @param flags a bitmask that controls what info is to be returned +/// and whether or not submatch is in effect. +static void reg_getline_common(linenr_T lnum, reg_getline_flags_T flags, char **line, + colnr_T *length) +{ + bool get_line = flags & RGLF_LINE; + bool get_length = flags & RGLF_LENGTH; + linenr_T firstlnum; + linenr_T maxline; + + if (flags & RGLF_SUBMATCH) { + firstlnum = rsm.sm_firstlnum + lnum; + maxline = rsm.sm_maxline; + } else { + firstlnum = rex.reg_firstlnum + lnum; + maxline = rex.reg_maxline; } - if (lnum > rex.reg_maxline) { - // Must have matched the "\n" in the last line. - return ""; + + // when looking behind for a match/no-match lnum is negative. but we + // can't go before line 1. + if (firstlnum < 1) { + if (get_line) { + *line = NULL; + } + if (get_length) { + *length = 0; + } + + return; + } + + if (lnum > maxline) { + // must have matched the "\n" in the last line. + if (get_line) { + *line = ""; + } + if (get_length) { + *length = 0; + } + + return; + } + + if (get_line) { + *line = ml_get_buf(rex.reg_buf, firstlnum); + } + if (get_length) { + *length = ml_get_buf_len(rex.reg_buf, firstlnum); } - return ml_get_buf(rex.reg_buf, rex.reg_firstlnum + lnum); +} + +/// Get pointer to the line "lnum", which is relative to "reg_firstlnum". +static char *reg_getline(linenr_T lnum) +{ + char *line; + reg_getline_common(lnum, RGLF_LINE, &line, NULL); + return line; +} + +/// Get length of line "lnum", which is relative to "reg_firstlnum". +static colnr_T reg_getline_len(linenr_T lnum) +{ + colnr_T length; + reg_getline_common(lnum, RGLF_LENGTH, NULL, &length); + return length; } static uint8_t *reg_startzp[NSUBEXP]; // Workspace to mark beginning @@ -1510,7 +1611,7 @@ static int match_with_backref(linenr_T start_lnum, colnr_T start_col, linenr_T e if (clnum == end_lnum) { len = end_col - ccol; } else { - len = (int)strlen(p + ccol); + len = reg_getline_len(clnum) - ccol; } if (cstrncmp(p + ccol, (char *)rex.input, &len) != 0) { @@ -1746,42 +1847,58 @@ static void do_lower(int *d, int c) char *regtilde(char *source, int magic, bool preview) { char *newsub = source; + size_t newsublen = 0; + char tilde[3] = { '~', NUL, NUL }; + size_t tildelen = 1; + bool error = false; + + if (!magic) { + tilde[0] = '\\'; + tilde[1] = '~'; + tilde[2] = NUL; + tildelen = 2; + } + + char *p; + for (p = newsub; *p; p++) { + if (strncmp(p, tilde, tildelen) == 0) { + size_t prefixlen = (size_t)(p - newsub); // not including the tilde + char *postfix = p + tildelen; + size_t postfixlen; + size_t tmpsublen; - for (char *p = newsub; *p; p++) { - if ((*p == '~' && magic) || (*p == '\\' && *(p + 1) == '~' && !magic)) { - if (reg_prev_sub != NULL) { - // length = len(newsub) - 1 + len(prev_sub) + 1 + if (newsublen == 0) { + newsublen = strlen(newsub); + } + newsublen -= tildelen; + postfixlen = newsublen - prefixlen; + tmpsublen = prefixlen + reg_prev_sublen + postfixlen; + + if (tmpsublen > 0 && reg_prev_sub != NULL) { // Avoid making the text longer than MAXCOL, it will cause // trouble at some point. - size_t prevsublen = strlen(reg_prev_sub); - size_t newsublen = strlen(newsub); - if (prevsublen > MAXCOL || newsublen > MAXCOL - || newsublen + prevsublen > MAXCOL) { + if (tmpsublen > MAXCOL) { emsg(_(e_resulting_text_too_long)); + error = true; break; } - char *tmpsub = xmalloc(newsublen + prevsublen); + char *tmpsub = xmalloc(tmpsublen + 1); // copy prefix - size_t prefixlen = (size_t)(p - newsub); // not including ~ memmove(tmpsub, newsub, prefixlen); // interpret tilde - memmove(tmpsub + prefixlen, reg_prev_sub, prevsublen); + memmove(tmpsub + prefixlen, reg_prev_sub, reg_prev_sublen); // copy postfix - if (!magic) { - p++; // back off backslash - } - STRCPY(tmpsub + prefixlen + prevsublen, p + 1); + STRCPY(tmpsub + prefixlen + reg_prev_sublen, postfix); if (newsub != source) { // allocated newsub before xfree(newsub); } newsub = tmpsub; - p = newsub + prefixlen + prevsublen; - } else if (magic) { - STRMOVE(p, p + 1); // remove '~' + newsublen = tmpsublen; + p = newsub + prefixlen + reg_prev_sublen; } else { - STRMOVE(p, p + 2); // remove '\~' + memmove(p, postfix, postfixlen + 1); // remove the tilde (+1 for the NUL) } p--; } else { @@ -1792,32 +1909,31 @@ char *regtilde(char *source, int magic, bool preview) } } + if (error) { + if (newsub != source) { + xfree(newsub); + } + return source; + } + // Only change reg_prev_sub when not previewing. if (!preview) { // Store a copy of newsub in reg_prev_sub. It is always allocated, // because recursive calls may make the returned string invalid. - xfree(reg_prev_sub); - reg_prev_sub = xstrdup(newsub); + // Only store it if there something to store. + newsublen = (size_t)(p - newsub); + if (newsublen == 0) { + XFREE_CLEAR(reg_prev_sub); + } else { + xfree(reg_prev_sub); + reg_prev_sub = xstrnsave(newsub, newsublen); + } + reg_prev_sublen = newsublen; } return newsub; } -static bool can_f_submatch = false; // true when submatch() can be used - -// These pointers are used for reg_submatch(). Needed for when the -// substitution string is an expression that contains a call to substitute() -// and submatch(). -typedef struct { - regmatch_T *sm_match; - regmmatch_T *sm_mmatch; - linenr_T sm_firstlnum; - linenr_T sm_maxline; - int sm_line_lbr; -} regsubmatch_T; - -static regsubmatch_T rsm; // can only be used when can_f_submatch is true - /// Put the submatches in "argv[argskip]" which is a list passed into /// call_func() by vim_regsub_both(). static int fill_submatch_list(int argc FUNC_ATTR_UNUSED, typval_T *argv, int argskip, ufunc_T *fp) @@ -1979,11 +2095,13 @@ static int vim_regsub_both(char *source, typval_T *expr, char *dest, int destlen // "flags & REGSUB_COPY" == 0 to the call with // "flags & REGSUB_COPY" != 0. if (copy) { - size_t reslen = eval_result[nested] != NULL ? strlen(eval_result[nested]) : 0; - if (eval_result[nested] != NULL && reslen < (size_t)destlen) { - STRCPY(dest, eval_result[nested]); - dst += reslen; - XFREE_CLEAR(eval_result[nested]); + if (eval_result[nested] != NULL) { + size_t eval_len = strlen(eval_result[nested]); + if (eval_len < (size_t)destlen) { + STRCPY(dest, eval_result[nested]); + dst += eval_len; + XFREE_CLEAR(eval_result[nested]); + } } } else { const bool prev_can_f_submatch = can_f_submatch; @@ -2218,7 +2336,7 @@ static int vim_regsub_both(char *source, typval_T *expr, char *dest, int destlen len = rex.reg_mmatch->endpos[no].col - rex.reg_mmatch->startpos[no].col; } else { - len = (int)strlen(s); + len = reg_getline_len(clnum) - rex.reg_mmatch->startpos[no].col; } } } else { @@ -2248,7 +2366,7 @@ static int vim_regsub_both(char *source, typval_T *expr, char *dest, int destlen if (rex.reg_mmatch->endpos[no].lnum == clnum) { len = rex.reg_mmatch->endpos[no].col; } else { - len = (int)strlen(s); + len = reg_getline_len(clnum); } } else { break; @@ -2325,23 +2443,18 @@ exit: return (int)((dst - dest) + 1); } -/// Call reg_getline() with the line numbers from the submatch. If a -/// substitute() was used the reg_maxline and other values have been -/// overwritten. static char *reg_getline_submatch(linenr_T lnum) { - char *s; - linenr_T save_first = rex.reg_firstlnum; - linenr_T save_max = rex.reg_maxline; - - rex.reg_firstlnum = rsm.sm_firstlnum; - rex.reg_maxline = rsm.sm_maxline; - - s = reg_getline(lnum); + char *line; + reg_getline_common(lnum, RGLF_LINE | RGLF_SUBMATCH, &line, NULL); + return line; +} - rex.reg_firstlnum = save_first; - rex.reg_maxline = save_max; - return s; +static colnr_T reg_getline_submatch_len(linenr_T lnum) +{ + colnr_T length; + reg_getline_common(lnum, RGLF_LENGTH | RGLF_SUBMATCH, NULL, &length); + return length; } /// Used for the submatch() function: get the string from the n'th submatch in @@ -2385,7 +2498,7 @@ char *reg_submatch(int no) } else { // Multiple lines: take start line from start col, middle // lines completely and end line up to end col. - len = (ssize_t)strlen(s); + len = reg_getline_submatch_len(lnum) - rsm.sm_mmatch->startpos[no].col; if (round == 2) { STRCPY(retval, s); retval[len] = '\n'; @@ -2393,15 +2506,16 @@ char *reg_submatch(int no) len++; lnum++; while (lnum < rsm.sm_mmatch->endpos[no].lnum) { - s = reg_getline_submatch(lnum++); + s = reg_getline_submatch(lnum); if (round == 2) { STRCPY(retval + len, s); } - len += (ssize_t)strlen(s); + len += reg_getline_submatch_len(lnum); if (round == 2) { retval[len] = '\n'; } len++; + lnum++; } if (round == 2) { strncpy(retval + len, // NOLINT(runtime/printf) @@ -2463,8 +2577,9 @@ list_T *reg_submatch_list(int no) if (slnum == elnum) { tv_list_append_string(list, s, ecol - scol); } else { + int max_lnum = elnum - slnum; tv_list_append_string(list, s, -1); - for (int i = 1; i < elnum - slnum; i++) { + for (int i = 1; i < max_lnum; i++) { s = reg_getline_submatch(slnum + i); tv_list_append_string(list, s, -1); } @@ -5357,9 +5472,12 @@ static regprog_T *bt_regcomp(uint8_t *expr, int re_flags) longest = NULL; len = 0; for (; scan != NULL; scan = regnext(scan)) { - if (OP(scan) == EXACTLY && strlen((char *)OPERAND(scan)) >= (size_t)len) { - longest = OPERAND(scan); - len = (int)strlen((char *)OPERAND(scan)); + if (OP(scan) == EXACTLY) { + size_t scanlen = strlen((char *)OPERAND(scan)); + if (scanlen >= (size_t)len) { + longest = OPERAND(scan); + len = (int)scanlen; + } } } r->regmust = longest; @@ -6091,7 +6209,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) pos = &fm->mark; const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum && pos->col == MAXCOL - ? (colnr_T)strlen(reg_getline(pos->lnum - rex.reg_firstlnum)) + ? reg_getline_len(pos->lnum - rex.reg_firstlnum) : pos->col; if (pos->lnum == rex.lnum + rex.reg_firstlnum @@ -7237,7 +7355,7 @@ static bool regmatch(uint8_t *scan, const proftime_T *tm, int *timed_out) if (rex.line == NULL) { break; } - rex.input = rex.line + strlen((char *)rex.line); + rex.input = rex.line + reg_getline_len(rex.lnum); reg_breakcheck(); } else { MB_PTR_BACK(rex.line, rex.input); @@ -7735,8 +7853,10 @@ static uint8_t *regprop(uint8_t *op) { char *p; static char buf[50]; + static size_t buflen = 0; STRCPY(buf, ":"); + buflen = 1; switch ((int)OP(op)) { case BOL: @@ -7976,7 +8096,8 @@ static uint8_t *regprop(uint8_t *op) case MOPEN + 7: case MOPEN + 8: case MOPEN + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "MOPEN%d", OP(op) - MOPEN); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "MOPEN%d", OP(op) - MOPEN); p = NULL; break; case MCLOSE + 0: @@ -7991,7 +8112,8 @@ static uint8_t *regprop(uint8_t *op) case MCLOSE + 7: case MCLOSE + 8: case MCLOSE + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "MCLOSE%d", OP(op) - MCLOSE); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "MCLOSE%d", OP(op) - MCLOSE); p = NULL; break; case BACKREF + 1: @@ -8003,7 +8125,8 @@ static uint8_t *regprop(uint8_t *op) case BACKREF + 7: case BACKREF + 8: case BACKREF + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "BACKREF%d", OP(op) - BACKREF); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "BACKREF%d", OP(op) - BACKREF); p = NULL; break; case NOPEN: @@ -8021,7 +8144,8 @@ static uint8_t *regprop(uint8_t *op) case ZOPEN + 7: case ZOPEN + 8: case ZOPEN + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "ZOPEN%d", OP(op) - ZOPEN); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "ZOPEN%d", OP(op) - ZOPEN); p = NULL; break; case ZCLOSE + 1: @@ -8033,7 +8157,8 @@ static uint8_t *regprop(uint8_t *op) case ZCLOSE + 7: case ZCLOSE + 8: case ZCLOSE + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "ZCLOSE%d", OP(op) - ZCLOSE); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "ZCLOSE%d", OP(op) - ZCLOSE); p = NULL; break; case ZREF + 1: @@ -8045,7 +8170,8 @@ static uint8_t *regprop(uint8_t *op) case ZREF + 7: case ZREF + 8: case ZREF + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "ZREF%d", OP(op) - ZREF); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "ZREF%d", OP(op) - ZREF); p = NULL; break; case STAR: @@ -8085,8 +8211,8 @@ static uint8_t *regprop(uint8_t *op) case BRACE_COMPLEX + 7: case BRACE_COMPLEX + 8: case BRACE_COMPLEX + 9: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "BRACE_COMPLEX%d", - OP(op) - BRACE_COMPLEX); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "BRACE_COMPLEX%d", OP(op) - BRACE_COMPLEX); p = NULL; break; case MULTIBYTECODE: @@ -8096,12 +8222,13 @@ static uint8_t *regprop(uint8_t *op) p = "NEWL"; break; default: - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "corrupt %d", OP(op)); + buflen += (size_t)snprintf(buf + buflen, sizeof(buf) - buflen, + "corrupt %d", OP(op)); p = NULL; break; } if (p != NULL) { - STRCAT(buf, p); + STRCPY(buf + buflen, p); } return (uint8_t *)buf; } @@ -13599,7 +13726,7 @@ static int recursive_regmatch(nfa_state_T *state, nfa_pim_T *pim, nfa_regprog_T rex.line = (uint8_t *)reg_getline(++rex.lnum); rex.input = rex.line; } else { - rex.input = rex.line + strlen((char *)rex.line); + rex.input = rex.line + reg_getline_len(rex.lnum); } } if ((int)(rex.input - rex.line) >= state->val) { @@ -14997,7 +15124,7 @@ static int nfa_regmatch(nfa_regprog_T *prog, nfa_state_T *start, regsubs_T *subm pos_T *pos = &fm->mark; const colnr_T pos_col = pos->lnum == rex.lnum + rex.reg_firstlnum && pos->col == MAXCOL - ? (colnr_T)strlen(reg_getline(pos->lnum - rex.reg_firstlnum)) + ? reg_getline_len(pos->lnum - rex.reg_firstlnum) : pos->col; result = pos->lnum == rex.lnum + rex.reg_firstlnum -- cgit From e7859d2ad504a3e3cae1d540d5fd4f9b560d154a Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Fri, 24 May 2024 05:57:00 +0800 Subject: vim-patch:9.1.0436: Crash when using '?' as separator for :s (#28955) Problem: Crash when using '?' as separator for :s and pattern contains escaped '?'s (after 9.1.0409). Solution: Always compute startplen. (zeertzjq). related: neovim/neovim#28935 closes: 14832 https://github.com/vim/vim/commit/789679cfc4f39505b135220672b43a260d8ca3b4 --- src/nvim/regexp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index 1027ce05b6..fa6e577c74 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -774,6 +774,7 @@ char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *droppe { magic_T mymagic; char *p = startp; + size_t startplen = strlen(startp); if (magic) { mymagic = MAGIC_ON; @@ -793,11 +794,9 @@ char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *droppe break; } } else if (p[0] == '\\' && p[1] != NUL) { - size_t startplen = 0; if (dirc == '?' && newp != NULL && p[1] == '?') { // change "\?" to "?", make a copy first. if (*newp == NULL) { - startplen = strlen(startp); *newp = xstrnsave(startp, startplen); p = *newp + (p - startp); } -- cgit From c836383d21b6d38ecf59e46e76da55ca97a4fc65 Mon Sep 17 00:00:00 2001 From: zeertzjq Date: Fri, 24 May 2024 15:04:33 +0800 Subject: vim-patch:9.1.0438: Wrong Ex command executed when :g uses '?' as delimiter (#28956) Problem: Wrong Ex command executed when :g uses '?' as delimiter and pattern contains escaped '?'. Solution: Don't use "*newp" when it's not allocated (zeertzjq). closes: vim/vim#14837 https://github.com/vim/vim/commit/3074137542961ce7b3b65c14ebde75f13f5e6147 --- src/nvim/regexp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'src/nvim/regexp.c') diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index fa6e577c74..5600d6a2f8 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -774,7 +774,7 @@ char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *droppe { magic_T mymagic; char *p = startp; - size_t startplen = strlen(startp); + size_t startplen = 0; if (magic) { mymagic = MAGIC_ON; @@ -796,14 +796,18 @@ char *skip_regexp_ex(char *startp, int dirc, int magic, char **newp, int *droppe } else if (p[0] == '\\' && p[1] != NUL) { if (dirc == '?' && newp != NULL && p[1] == '?') { // change "\?" to "?", make a copy first. + if (startplen == 0) { + startplen = strlen(startp); + } if (*newp == NULL) { *newp = xstrnsave(startp, startplen); p = *newp + (p - startp); + startp = *newp; } if (dropped != NULL) { (*dropped)++; } - memmove(p, p + 1, (startplen - (size_t)((p + 1) - *newp)) + 1); + memmove(p, p + 1, startplen - (size_t)((p + 1) - startp) + 1); } else { p++; // skip next character } -- cgit