diff options
author | KunMing Xie <qqzz014@gmail.com> | 2018-06-02 01:57:22 +0800 |
---|---|---|
committer | Justin M. Keyes <justinkz@gmail.com> | 2018-06-01 19:57:22 +0200 |
commit | 49a497a67c92f339ff9ce2939188b651e250367b (patch) | |
tree | 6774ff316bd501ba21169b59cc5dbd0759376964 | |
parent | c7350f542ade5ea4f19e490ef3638fbf8cf6db41 (diff) | |
download | rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.gz rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.bz2 rneovim-49a497a67c92f339ff9ce2939188b651e250367b.zip |
vim-patch:8.0.0519: character classes not well tested (#8460)
Problem: Character classes are not well tested. They can differ between
platforms.
Solution: Add tests. In the documentation make clear which classes depend
on what library function. Only use :cntrl: and :graph: for ASCII.
(Kazunobu Kuriyama, Dominique Pelle, closes vim/vim#1560)
Update the documentation.
https://github.com/vim/vim/commit/0c078fc7db2902d4ccba04506db082ddbef45a8c
-rw-r--r-- | runtime/doc/pattern.txt | 43 | ||||
-rw-r--r-- | src/nvim/regexp.c | 6 | ||||
-rw-r--r-- | src/nvim/regexp_nfa.c | 6 | ||||
-rw-r--r-- | src/nvim/testdir/test_regexp_utf8.vim | 67 |
4 files changed, 90 insertions, 32 deletions
diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt index ab78b8b71c..cc485b655d 100644 --- a/runtime/doc/pattern.txt +++ b/runtime/doc/pattern.txt @@ -1071,25 +1071,27 @@ x A single character, with no special meaning, matches itself - A character class expression is evaluated to the set of characters belonging to that character class. The following character classes are supported: - Name Contents ~ -*[:alnum:]* [:alnum:] ASCII letters and digits -*[:alpha:]* [:alpha:] ASCII letters -*[:blank:]* [:blank:] space and tab characters -*[:cntrl:]* [:cntrl:] control characters -*[:digit:]* [:digit:] decimal digits -*[:graph:]* [:graph:] printable characters excluding space -*[:lower:]* [:lower:] lowercase letters (all letters when + Name Func Contents ~ +*[:alnum:]* [:alnum:] isalnum ASCII letters and digits +*[:alpha:]* [:alpha:] isalpha ASCII letters +*[:blank:]* [:blank:] space and tab +*[:cntrl:]* [:cntrl:] iscntrl ASCII control characters +*[:digit:]* [:digit:] decimal digits '0' to '9' +*[:graph:]* [:graph:] isgraph ASCII printable characters excluding + space +*[:lower:]* [:lower:] (1) lowercase letters (all letters when 'ignorecase' is used) -*[:print:]* [:print:] printable characters including space -*[:punct:]* [:punct:] ASCII punctuation characters -*[:space:]* [:space:] whitespace characters -*[:upper:]* [:upper:] uppercase letters (all letters when +*[:print:]* [:print:] (2) printable characters including space +*[:punct:]* [:punct:] ispunct ASCII punctuation characters +*[:space:]* [:space:] whitespace characters: space, tab, CR, + NL, vertical tab, form feed +*[:upper:]* [:upper:] (3) uppercase letters (all letters when 'ignorecase' is used) -*[:xdigit:]* [:xdigit:] hexadecimal digits -*[:return:]* [:return:] the <CR> character -*[:tab:]* [:tab:] the <Tab> character -*[:escape:]* [:escape:] the <Esc> character -*[:backspace:]* [:backspace:] the <BS> character +*[:xdigit:]* [:xdigit:] hexadecimal digits: 0-9, a-f, A-F +*[:return:]* [:return:] the <CR> character +*[:tab:]* [:tab:] the <Tab> character +*[:escape:]* [:escape:] the <Esc> character +*[:backspace:]* [:backspace:] the <BS> character The brackets in character class expressions are additional to the brackets delimiting a collection. For example, the following is a plausible pattern for a Unix filename: "[-./[:alnum:]_~]\+" That is, @@ -1100,6 +1102,13 @@ x A single character, with no special meaning, matches itself regexp engine. See |two-engines|. In the future these items may work for multi-byte characters. For now, to get all "alpha" characters you can use: [[:lower:][:upper:]]. + + The "Func" column shows what library function is used. The + implementation depends on the system. Otherwise: + (1) Uses islower() for ASCII and Vim builtin rules for other + characters when built with the |+multi_byte| feature. + (2) Uses Vim builtin rules + (3) As with (1) but using isupper() */[[=* *[==]* - An equivalence class. This means that characters are matched that have almost the same meaning, e.g., when ignoring accents. This diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c index e4de43b49e..ee7d6d8500 100644 --- a/src/nvim/regexp.c +++ b/src/nvim/regexp.c @@ -2328,21 +2328,21 @@ collection: regc('\t'); break; case CLASS_CNTRL: - for (cu = 1; cu <= 255; cu++) { + for (cu = 1; cu <= 127; cu++) { if (iscntrl(cu)) { regmbc(cu); } } break; case CLASS_DIGIT: - for (cu = 1; cu <= 255; cu++) { + for (cu = 1; cu <= 127; cu++) { if (ascii_isdigit(cu)) { regmbc(cu); } } break; case CLASS_GRAPH: - for (cu = 1; cu <= 255; cu++) { + for (cu = 1; cu <= 127; cu++) { if (isgraph(cu)) { regmbc(cu); } diff --git a/src/nvim/regexp_nfa.c b/src/nvim/regexp_nfa.c index 98fae858f6..0b8e979ca2 100644 --- a/src/nvim/regexp_nfa.c +++ b/src/nvim/regexp_nfa.c @@ -4358,16 +4358,18 @@ static int check_char_class(int class, int c) return OK; break; case NFA_CLASS_CNTRL: - if (c >= 1 && c <= 255 && iscntrl(c)) + if (c >= 1 && c <= 127 && iscntrl(c)) { return OK; + } break; case NFA_CLASS_DIGIT: if (ascii_isdigit(c)) return OK; break; case NFA_CLASS_GRAPH: - if (c >= 1 && c <= 255 && isgraph(c)) + if (c >= 1 && c <= 127 && isgraph(c)) { return OK; + } break; case NFA_CLASS_LOWER: if (mb_islower(c) && c != 170 && c != 186) { diff --git a/src/nvim/testdir/test_regexp_utf8.vim b/src/nvim/testdir/test_regexp_utf8.vim index a2f4286d4f..ecd686743e 100644 --- a/src/nvim/testdir/test_regexp_utf8.vim +++ b/src/nvim/testdir/test_regexp_utf8.vim @@ -35,12 +35,21 @@ func s:classes_test() set isprint=@,161-255 call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+')) + let alnumchars = '' let alphachars = '' + let backspacechar = '' + let blankchars = '' + let cntrlchars = '' + let digitchars = '' + let escapechar = '' + let graphchars = '' let lowerchars = '' - let upperchars = '' - let alnumchars = '' let printchars = '' let punctchars = '' + let returnchar = '' + let spacechars = '' + let tabchar = '' + let upperchars = '' let xdigitchars = '' let i = 1 while i <= 255 @@ -48,21 +57,48 @@ func s:classes_test() if c =~ '[[:alpha:]]' let alphachars .= c endif - if c =~ '[[:lower:]]' - let lowerchars .= c - endif - if c =~ '[[:upper:]]' - let upperchars .= c - endif if c =~ '[[:alnum:]]' let alnumchars .= c endif + if c =~ '[[:backspace:]]' + let backspacechar .= c + endif + if c =~ '[[:blank:]]' + let blankchars .= c + endif + if c =~ '[[:cntrl:]]' + let cntrlchars .= c + endif + if c =~ '[[:digit:]]' + let digitchars .= c + endif + if c =~ '[[:escape:]]' + let escapechar .= c + endif + if c =~ '[[:graph:]]' + let graphchars .= c + endif + if c =~ '[[:lower:]]' + let lowerchars .= c + endif if c =~ '[[:print:]]' let printchars .= c endif if c =~ '[[:punct:]]' let punctchars .= c endif + if c =~ '[[:return:]]' + let returnchar .= c + endif + if c =~ '[[:space:]]' + let spacechars .= c + endif + if c =~ '[[:tab:]]' + let tabchar .= c + endif + if c =~ '[[:upper:]]' + let upperchars .= c + endif if c =~ '[[:xdigit:]]' let xdigitchars .= c endif @@ -70,11 +106,22 @@ func s:classes_test() endwhile call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars) - call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars) - call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars) call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars) + call assert_equal("\b", backspacechar) + call assert_equal("\t ", blankchars) + " Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis. + " call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars) + call assert_equal("0123456789", digitchars) + call assert_equal("\<Esc>", escapechar) + " Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis. + " call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars) + call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars) call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars) call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars) + call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars) + call assert_equal("\r", returnchar) + call assert_equal("\t\n\x0b\f\r ", spacechars) + call assert_equal("\t", tabchar) call assert_equal('0123456789ABCDEFabcdef', xdigitchars) endfunc |