vim-patch:9.1.1258: regexp: max \U and \%U value is limited by INT_MAX (#33156)

Problem: regexp: max \U and \%U value is limited by INT_MAX but gives a confusing error message (related: v8.1.0985). Solution: give a better error message when the value reaches INT_MAX. When searching, Vim allows getting up to 8 hex characters using the /\U and /\%U regex atoms. However, when using "/\UFFFFFFFF" the code point is already above what an integer variable can hold, which is 2,147,483,647. Since patch v8.1.0985, Vim already limited the max codepoint to INT_MAX (otherwise it caused a crash in the nfa regex engine), but instead of erroring out it silently fell back to parsing the number as a backslash value and not as a codepoint value, and as such this "/[\UFFFFFFFF]" will happily find a "\" or a literal "F". And this "/[\d127-\UFFFFFFFF]" will error out with "reverse range in character class". Interestingly, the max Unicode codepoint value is U+10FFFF, which still fits into an ordinary integer value, which means that we don't even need to parse 8 hex characters; 6 should have been enough. However, let's not limit Vim to searching for only max 6 hex characters (which would be a backward incompatible change), but instead allow all 8 characters and, only if the codepoint reaches INT_MAX, give a more precise error message (about what the max Unicode codepoint value is). This allows searching for "[\U7FFFFFFE]" (will likely return "E486 Pattern not found"), while "[\U7FFFFFFF]" now errors with "E1541: Value too large, max Unicode codepoint is U+10FFFF". While this change is straightforward on architectures where long is 8 bytes, it is not so simple on Windows or 32-bit architectures where long is 4 bytes (and therefore the test fails there). To account for that, let's make use of the vimlong_T number type, make a few corresponding changes in the regex engine code, and cast the value to the expected data type. This, however, may not work correctly on systems that don't have the long long datatype (e.g. OpenVMS), and the test will probably fail there. 
fixes: vim/vim#16949 closes: vim/vim#16994 https://github.com/vim/vim/commit/f2b16986a194ab839c5a23bd7fe904f9fae1526f Co-authored-by: Christian Brabandt <cb@256bit.org>
author: zeertzjq <zeertzjq@outlook.com> 2025-03-29 21:05:03 +0800
committer: GitHub <noreply@github.com> 2025-03-29 13:05:03 +0000
commit: 89bc9455543abbd98bba752367ab5f2b83943931 (patch)
tree: 6576ffb34e5e9cd03b12b3348e49925a80150513
parent: 78d2e0b43e7a8dccbd4444a7d11e55d8f9a2d71b (diff)
download: rneovim-89bc9455543abbd98bba752367ab5f2b83943931.tar.gz
rneovim-89bc9455543abbd98bba752367ab5f2b83943931.tar.bz2
rneovim-89bc9455543abbd98bba752367ab5f2b83943931.zip
4 files changed, 55 insertions, 6 deletions
diff --git a/runtime/doc/pattern.txt b/runtime/doc/pattern.txt
index be913e941e..ec1eb9a4a0 100644
--- a/runtime/doc/pattern.txt
+++ b/runtime/doc/pattern.txt
@@ -1206,7 +1206,8 @@ x	A single character, with no special meaning, matches itself
 		\o40	octal number of character up to 0o377
 		\x20	hexadecimal number of character up to 0xff
 		\u20AC	hex. number of multibyte character up to 0xffff
-		\U1234	hex. number of multibyte character up to 0xffffffff
+		\U1234	hex. number of multibyte character up to 8 characters
+			0xffffffff |E1541|
 	  NOTE: The other backslash codes mentioned above do not work inside
 	  []!
 	- Matching with a collection can be slow, because each character in
@@ -1246,7 +1247,8 @@ x	A single character, with no special meaning, matches itself
 \%u20AC	Matches the character specified with up to four hexadecimal
 	characters.
 \%U1234abcd	Matches the character specified with up to eight hexadecimal
-	characters, up to 0x7fffffff
+	characters, up to 0x7fffffff (the maximum allowed value is INT_MAX
+	|E1541|, but the maximum valid Unicode codepoint is U+10FFFF).
 
 ==============================================================================
 7. Ignoring case in a pattern					*/ignorecase*
diff --git a/runtime/doc/vi_diff.txt b/runtime/doc/vi_diff.txt
index 0a0cbc8ec6..c6de169853 100644
--- a/runtime/doc/vi_diff.txt
+++ b/runtime/doc/vi_diff.txt
@@ -31,8 +31,11 @@ Maximum display width	   Unix and Win32: 1024 characters, otherwise 255
 Maximum lhs of a mapping   50 characters.
 Number of different highlighting types: over 30000
 Range of a Number variable:  -2147483648 to 2147483647 (might be more on 64
-			   bit systems)
+			   bit systems)  See also: |v:numbermax|,
+			   |v:numbermin| and |v:numbersize|
 Maximum length of a line in a tags file: 512 bytes.
+							*E1541*
+Maximum value for |/\U| and |/\%U|: 2147483647 (for 32bit integer).
 
 Information for undo and text in registers is kept in memory, thus when making
 (big) changes the amount of (virtual) memory available limits the number of
diff --git a/src/nvim/regexp.c b/src/nvim/regexp.c
index de9a7e580f..7a8d963dee 100644
--- a/src/nvim/regexp.c
+++ b/src/nvim/regexp.c
@@ -367,6 +367,8 @@ static const char e_nfa_regexp_missing_value_in_chr[]
 static const char e_atom_engine_must_be_at_start_of_pattern[]
   = N_("E1281: Atom '\\%%#=%c' must be at the start of the pattern");
 static const char e_substitute_nesting_too_deep[] = N_("E1290: substitute nesting too deep");
+static const char e_unicode_val_too_large[]
+  = N_("E1541: Value too large, max Unicode codepoint is U+10FFFF");
 
 #define NOT_MULTI       0
 #define MULTI_ONE       1
@@ -4796,6 +4798,11 @@ collection:
                        || *regparse == 'u'
                        || *regparse == 'U') {
               startc = coll_get_char();
+              // max UTF-8 Codepoint is U+10FFFF,
+              // but allow values until INT_MAX
+              if (startc == INT_MAX) {
+                EMSG_RET_NULL(_(e_unicode_val_too_large));
+              }
               if (startc == 0) {
                 regc(0x0a);
               } else {
@@ -5548,12 +5555,15 @@ static int coll_get_char(void)
   case 'U':
     nr = gethexchrs(8); break;
   }
-  if (nr < 0 || nr > INT_MAX) {
+  if (nr < 0) {
     // If getting the number fails be backwards compatible: the character
     // is a backslash.
     regparse--;
     nr = '\\';
   }
+  if (nr > INT_MAX) {
+    nr = INT_MAX;
+  }
   return (int)nr;
 }
 
@@ -10565,6 +10575,11 @@ collection:
                      || *regparse == 'U') {
             // TODO(RE): This needs more testing
             startc = coll_get_char();
+            // max UTF-8 Codepoint is U+10FFFF,
+            // but allow values until INT_MAX
+            if (startc == INT_MAX) {
+              EMSG_RET_FAIL(_(e_unicode_val_too_large));
+            }
             got_coll_char = true;
             MB_PTR_BACK(old_regparse, regparse);
           } else {
diff --git a/test/old/testdir/test_search.vim b/test/old/testdir/test_search.vim
index cd36f56f17..4e5cb574bd 100644
--- a/test/old/testdir/test_search.vim
+++ b/test/old/testdir/test_search.vim
@@ -1499,17 +1499,46 @@ func Test_large_hex_chars2()
   try
     /[\Ufffffc1f]
   catch
-    call assert_match('E486:', v:exception)
+    call assert_match('E1541:', v:exception)
   endtry
   try
     set re=1
     /[\Ufffffc1f]
   catch
-    call assert_match('E486:', v:exception)
+    call assert_match('E1541:', v:exception)
   endtry
   set re&
 endfunc
 
+func Test_large_hex_chars3()
+  " Validate max number of Unicode char
+  try
+    /[\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /[\UFFFFFFF]
+  catch
+    call assert_match('E486:', v:exception)
+  endtry
+  try
+    /\%#=2[\d32-\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /\%#=1[\UFFFFFFFF]
+  catch
+    call assert_match('E1541:', v:exception)
+  endtry
+  try
+    /\%#=1[\d32-\UFFFFFFFF]
+  catch
+    call assert_match('E945:', v:exception)
+  endtry
+endfunc
+
 func Test_one_error_msg()
   " This was also giving an internal error
   call assert_fails('call search(" \\((\\v[[=P=]]){185}+             ")', 'E871:')
author	zeertzjq <zeertzjq@outlook.com>	2025-03-29 21:05:03 +0800
committer	GitHub <noreply@github.com>	2025-03-29 13:05:03 +0000
commit	89bc9455543abbd98bba752367ab5f2b83943931 (patch)
tree	6576ffb34e5e9cd03b12b3348e49925a80150513
parent	78d2e0b43e7a8dccbd4444a7d11e55d8f9a2d71b (diff)
download	rneovim-89bc9455543abbd98bba752367ab5f2b83943931.tar.gz rneovim-89bc9455543abbd98bba752367ab5f2b83943931.tar.bz2 rneovim-89bc9455543abbd98bba752367ab5f2b83943931.zip