vim-patch:8.0.0519: character classes not well tested (#8460)

Problem: Character classes are not well tested. They can differ between platforms. Solution: Add tests. In the documentation make clear which classes depend on what library function. Only use :cntrl: and :graph: for ASCII. (Kazunobu Kuriyama, Dominique Pelle, closes vim/vim#1560) Update the documentation. https://github.com/vim/vim/commit/0c078fc7db2902d4ccba04506db082ddbef45a8c
author: KunMing Xie <qqzz014@gmail.com> 2018-06-02 01:57:22 +0800
committer: Justin M. Keyes <justinkz@gmail.com> 2018-06-01 19:57:22 +0200
commit: 49a497a67c92f339ff9ce2939188b651e250367b (patch)
tree: 6774ff316bd501ba21169b59cc5dbd0759376964
parent: c7350f542ade5ea4f19e490ef3638fbf8cf6db41 (diff)
download: rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.gz
rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.bz2
rneovim-49a497a67c92f339ff9ce2939188b651e250367b.zip
4 files changed, 90 insertions, 32 deletions
diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
index ab78b8b71c..cc485b655d 100644
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -1071,25 +1071,27 @@ x	A single character, with no special meaning, matches itself
 	- A character class expression is evaluated to the set of characters
 	  belonging to that character class.  The following character classes
 	  are supported:
-			  Name		Contents ~
-*[:alnum:]*		  [:alnum:]     ASCII letters and digits
-*[:alpha:]*		  [:alpha:]     ASCII letters
-*[:blank:]*		  [:blank:]     space and tab characters
-*[:cntrl:]*		  [:cntrl:]     control characters
-*[:digit:]*		  [:digit:]     decimal digits
-*[:graph:]*		  [:graph:]     printable characters excluding space
-*[:lower:]*		  [:lower:]     lowercase letters (all letters when
+		  Name	      Func	Contents ~
+*[:alnum:]*	  [:alnum:]   isalnum	ASCII letters and digits
+*[:alpha:]*	  [:alpha:]   isalpha  	ASCII letters
+*[:blank:]*	  [:blank:]     	space and tab
+*[:cntrl:]*	  [:cntrl:]   iscntrl 	ASCII control characters
+*[:digit:]*	  [:digit:]     	decimal digits '0' to '9'
+*[:graph:]*	  [:graph:]   isgraph	ASCII printable characters excluding
+					space
+*[:lower:]*	  [:lower:]   (1)	lowercase letters (all letters when
 					'ignorecase' is used)
-*[:print:]*		  [:print:]     printable characters including space
-*[:punct:]*		  [:punct:]     ASCII punctuation characters
-*[:space:]*		  [:space:]     whitespace characters
-*[:upper:]*		  [:upper:]     uppercase letters (all letters when
+*[:print:]*	  [:print:]   (2) 	printable characters including space
+*[:punct:]*	  [:punct:]   ispunct	ASCII punctuation characters
+*[:space:]*	  [:space:]     	whitespace characters: space, tab, CR,
+					NL, vertical tab, form feed
+*[:upper:]*	  [:upper:]   (3)	uppercase letters (all letters when
 					'ignorecase' is used)
-*[:xdigit:]*		  [:xdigit:]    hexadecimal digits
-*[:return:]*		  [:return:]	the <CR> character
-*[:tab:]*		  [:tab:]	the <Tab> character
-*[:escape:]*		  [:escape:]	the <Esc> character
-*[:backspace:]*		  [:backspace:]	the <BS> character
+*[:xdigit:]*	  [:xdigit:]    	hexadecimal digits: 0-9, a-f, A-F
+*[:return:]*	  [:return:]		the <CR> character
+*[:tab:]*	  [:tab:]		the <Tab> character
+*[:escape:]*	  [:escape:]		the <Esc> character
+*[:backspace:]*	  [:backspace:]		the <BS> character
 	  The brackets in character class expressions are additional to the
 	  brackets delimiting a collection.  For example, the following is a
 	  plausible pattern for a Unix filename: "[-./[:alnum:]_~]\+" That is,
@@ -1100,6 +1102,13 @@ x	A single character, with no special meaning, matches itself
 	  regexp engine.  See |two-engines|.  In the future these items may
 	  work for multi-byte characters.  For now, to get all "alpha"
 	  characters you can use: [[:lower:][:upper:]].
+
+	  The "Func" column shows which library function is used; the
+	  result of that function depends on the system.  Numbered entries:
+	  (1) Uses islower() for ASCII and Vim builtin rules for other
+	  characters when built with the |+multi_byte| feature.
+	  (2) Uses Vim builtin rules.
+	  (3) As with (1) but using isupper().
 							*/[[=* *[==]*
 	- An equivalence class.  This means that characters are matched that
 	  have almost the same meaning, e.g., when ignoring accents.  This
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index e4de43b49e..ee7d6d8500 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -2328,21 +2328,21 @@ collection:
               regc('\t');
               break;
             case CLASS_CNTRL:
-              for (cu = 1; cu <= 255; cu++) {
+              for (cu = 1; cu <= 127; cu++) {
                 if (iscntrl(cu)) {
                   regmbc(cu);
                 }
               }
               break;
             case CLASS_DIGIT:
-              for (cu = 1; cu <= 255; cu++) {
+              for (cu = 1; cu <= 127; cu++) {
                 if (ascii_isdigit(cu)) {
                   regmbc(cu);
                 }
               }
               break;
             case CLASS_GRAPH:
-              for (cu = 1; cu <= 255; cu++) {
+              for (cu = 1; cu <= 127; cu++) {
                 if (isgraph(cu)) {
                   regmbc(cu);
                 }
diff --git a/src/nvim/regexp_nfa.c b/src/nvim/regexp_nfa.c
index 98fae858f6..0b8e979ca2 100644
--- a/src/nvim/regexp_nfa.c
+++ b/src/nvim/regexp_nfa.c
@@ -4358,16 +4358,18 @@ static int check_char_class(int class, int c)
       return OK;
     break;
   case NFA_CLASS_CNTRL:
-    if (c >= 1 && c <= 255 && iscntrl(c))
+    if (c >= 1 && c <= 127 && iscntrl(c)) {
       return OK;
+    }
     break;
   case NFA_CLASS_DIGIT:
     if (ascii_isdigit(c))
       return OK;
     break;
   case NFA_CLASS_GRAPH:
-    if (c >= 1 && c <= 255 && isgraph(c))
+    if (c >= 1 && c <= 127 && isgraph(c)) {
       return OK;
+    }
     break;
   case NFA_CLASS_LOWER:
     if (mb_islower(c) && c != 170 && c != 186) {
diff --git a/src/nvim/testdir/test_regexp_utf8.vim b/src/nvim/testdir/test_regexp_utf8.vim
index a2f4286d4f..ecd686743e 100644
--- a/src/nvim/testdir/test_regexp_utf8.vim
+++ b/src/nvim/testdir/test_regexp_utf8.vim
@@ -35,12 +35,21 @@ func s:classes_test()
   set isprint=@,161-255
   call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+'))
 
+  let alnumchars = ''
   let alphachars = ''
+  let backspacechar = ''
+  let blankchars = ''
+  let cntrlchars = ''
+  let digitchars = ''
+  let escapechar = ''
+  let graphchars = ''
   let lowerchars = ''
-  let upperchars = ''
-  let alnumchars = ''
   let printchars = ''
   let punctchars = ''
+  let returnchar = ''
+  let spacechars = ''
+  let tabchar = ''
+  let upperchars = ''
   let xdigitchars = ''
   let i = 1
   while i <= 255
@@ -48,21 +57,48 @@ func s:classes_test()
     if c =~ '[[:alpha:]]'
       let alphachars .= c
     endif
-    if c =~ '[[:lower:]]'
-      let lowerchars .= c
-    endif
-    if c =~ '[[:upper:]]'
-      let upperchars .= c
-    endif
     if c =~ '[[:alnum:]]'
       let alnumchars .= c
     endif
+    if c =~ '[[:backspace:]]'
+      let backspacechar .= c
+    endif
+    if c =~ '[[:blank:]]'
+      let blankchars .= c
+    endif
+    if c =~ '[[:cntrl:]]'
+      let cntrlchars .= c
+    endif
+    if c =~ '[[:digit:]]'
+      let digitchars .= c
+    endif
+    if c =~ '[[:escape:]]'
+      let escapechar .= c
+    endif
+    if c =~ '[[:graph:]]'
+      let graphchars .= c
+    endif
+    if c =~ '[[:lower:]]'
+      let lowerchars .= c
+    endif
     if c =~ '[[:print:]]'
       let printchars .= c
     endif
     if c =~ '[[:punct:]]'
       let punctchars .= c
     endif
+    if c =~ '[[:return:]]'
+      let returnchar .= c
+    endif
+    if c =~ '[[:space:]]'
+      let spacechars .= c
+    endif
+    if c =~ '[[:tab:]]'
+      let tabchar .= c
+    endif
+    if c =~ '[[:upper:]]'
+      let upperchars .= c
+    endif
     if c =~ '[[:xdigit:]]'
       let xdigitchars .= c
     endif
@@ -70,11 +106,22 @@ func s:classes_test()
   endwhile
 
   call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars)
-  call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
-  call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
   call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars)
+  call assert_equal("\b", backspacechar)
+  call assert_equal("\t ", blankchars)
+  " Commented out: it succeeds on Linux and Windows, but fails on macOS in Travis.
+  " call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars)
+  call assert_equal("0123456789", digitchars)
+  call assert_equal("\<Esc>", escapechar)
+  " Commented out: it succeeds on Linux and Windows, but fails on macOS in Travis.
+  " call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars)
+  call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
   call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars)
   call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars)
+  call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
+  call assert_equal("\r", returnchar)
+  call assert_equal("\t\n\x0b\f\r ", spacechars)
+  call assert_equal("\t", tabchar)
   call assert_equal('0123456789ABCDEFabcdef', xdigitchars)
 endfunc
author	KunMing Xie <qqzz014@gmail.com>	2018-06-02 01:57:22 +0800
committer	Justin M. Keyes <justinkz@gmail.com>	2018-06-01 19:57:22 +0200
commit	49a497a67c92f339ff9ce2939188b651e250367b (patch)
tree	6774ff316bd501ba21169b59cc5dbd0759376964
parent	c7350f542ade5ea4f19e490ef3638fbf8cf6db41 (diff)
download	rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.gz rneovim-49a497a67c92f339ff9ce2939188b651e250367b.tar.bz2 rneovim-49a497a67c92f339ff9ce2939188b651e250367b.zip