vim-patch:9.0.1485: no functions for converting from/to UTF-16 index (#23318)

Problem: no functions for converting from/to UTF-16 index. Solution: Add UTF-16 flag to existing functions and add strutf16len() and utf16idx(). (Yegappan Lakshmanan, closes vim/vim#12216) https://github.com/vim/vim/commit/67672ef097dd708244ff042a8364994da2b91e75 Co-authored-by: Yegappan Lakshmanan <yegappan@yahoo.com>
author: zeertzjq <zeertzjq@outlook.com> 2023-04-26 09:50:37 +0800
committer: GitHub <noreply@github.com> 2023-04-26 09:50:37 +0800
commit: 191e8b40625731a652bade7000911554834afe5f (patch)
tree: ba2267f67dc2fc833de914708efde646d9df466b
parent: 8af97ecefa71c6391a52ab799d354e058cb470be (diff)
download: rneovim-191e8b40625731a652bade7000911554834afe5f.tar.gz
rneovim-191e8b40625731a652bade7000911554834afe5f.tar.bz2
rneovim-191e8b40625731a652bade7000911554834afe5f.zip
6 files changed, 664 insertions, 44 deletions
diff --git a/runtime/doc/builtin.txt b/runtime/doc/builtin.txt
index 1d0cb6ebc4..b37ac117f3 100644
--- a/runtime/doc/builtin.txt
+++ b/runtime/doc/builtin.txt
@@ -69,8 +69,10 @@ bufnr([{buf} [, {create}]])	Number	Number of the buffer {buf}
 bufwinid({buf})			Number	window ID of buffer {buf}
 bufwinnr({buf})			Number	window number of buffer {buf}
 byte2line({byte})		Number	line number at byte count {byte}
-byteidx({expr}, {nr})		Number	byte index of {nr}th char in {expr}
-byteidxcomp({expr}, {nr})	Number	byte index of {nr}th char in {expr}
+byteidx({expr}, {nr} [, {utf16}])
+				Number	byte index of {nr}th char in {expr}
+byteidxcomp({expr}, {nr} [, {utf16}])
+				Number	byte index of {nr}th char in {expr}
 call({func}, {arglist} [, {dict}])
 				any	call {func} with arguments {arglist}
 ceil({expr})			Float	round {expr} up
@@ -80,7 +82,7 @@ chansend({id}, {data})		Number	Writes {data} to channel
 char2nr({expr} [, {utf8}])	Number	ASCII/UTF-8 value of first char in {expr}
 charclass({string})		Number	character class of {string}
 charcol({expr} [, {winid}])	Number	column number of cursor or mark
-charidx({string}, {idx} [, {countcc}])
+charidx({string}, {idx} [, {countcc} [, {utf16}]])
 				Number	char index of byte {idx} in {string}
 chdir({dir})			String	change current working directory
 cindent({lnum})			Number	C indent for line {lnum}
@@ -501,6 +503,8 @@ strptime({format}, {timestring})
 strridx({haystack}, {needle} [, {start}])
 				Number	last index of {needle} in {haystack}
 strtrans({expr})		String	translate string to make it printable
+strutf16len({string} [, {countcc}])
+				Number	number of UTF-16 code units in {string}
 strwidth({expr})		Number	display cell length of the String {expr}
 submatch({nr} [, {list}])	String or List
 					specific match in ":s" or substitute()
@@ -545,6 +549,8 @@ undofile({name})		String	undo file name for {name}
 undotree()			List	undo file tree
 uniq({list} [, {func} [, {dict}]])
 				List	remove adjacent duplicates from a list
+utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
+				Number	UTF-16 index of byte {idx} in {string}
 values({dict})			List	values in {dict}
 virtcol({expr} [, {list}])	Number or List
 					screen column of cursor or mark
@@ -982,7 +988,7 @@ byte2line({byte})					*byte2line()*
 		Can also be used as a |method|: >
 			GetOffset()->byte2line()
 
-byteidx({expr}, {nr})					*byteidx()*
+byteidx({expr}, {nr} [, {utf16}])			*byteidx()*
 		Return byte index of the {nr}th character in the String
 		{expr}.  Use zero for the first character, it then returns
 		zero.
@@ -992,6 +998,13 @@ byteidx({expr}, {nr})					*byteidx()*
 		length is added to the preceding base character.  See
 		|byteidxcomp()| below for counting composing characters
 		separately.
+		When {utf16} is present and TRUE, {nr} is used as the UTF-16
+		index in the String {expr} instead of as the character index.
+		The UTF-16 index is the index in the string when it is encoded
+		with 16-bit words.  If the specified UTF-16 index is in the
+		middle of a character (e.g. in a 4-byte character), then the
+		byte index of the first byte in the character is returned.
+		Refer to |string-offset-encoding| for more information.
 		Example : >
 			echo matchstr(str, ".", byteidx(str, 3))
 <		will display the fourth character.  Another way to do the
@@ -1003,11 +1016,17 @@ byteidx({expr}, {nr})					*byteidx()*
 		If there are less than {nr} characters -1 is returned.
 		If there are exactly {nr} characters the length of the string
 		in bytes is returned.
-
+		See |charidx()| and |utf16idx()| for getting the character and
+		UTF-16 index respectively from the byte index.
+		Examples: >
+			echo byteidx('a😊😊', 2)	returns 5
+			echo byteidx('a😊😊', 2, 1)	returns 1
+			echo byteidx('a😊😊', 3, 1)	returns 5
+<
 		Can also be used as a |method|: >
 			GetName()->byteidx(idx)
 
-byteidxcomp({expr}, {nr})					*byteidxcomp()*
+byteidxcomp({expr}, {nr} [, {utf16}])			*byteidxcomp()*
 		Like byteidx(), except that a composing character is counted
 		as a separate character.  Example: >
 			let s = 'e' .. nr2char(0x301)
@@ -1131,27 +1150,36 @@ charcol({expr} [, {winid}])				*charcol()*
 			GetPos()->col()
 <
 							*charidx()*
-charidx({string}, {idx} [, {countcc}])
+charidx({string}, {idx} [, {countcc} [, {utf16}]])
 		Return the character index of the byte at {idx} in {string}.
 		The index of the first character is zero.
 		If there are no multibyte characters the returned value is
 		equal to {idx}.
+
 		When {countcc} is omitted or |FALSE|, then composing characters
-		are not counted separately, their byte length is
-		added to the preceding base character.
+		are not counted separately, their byte length is added to the
+		preceding base character.
 		When {countcc} is |TRUE|, then composing characters are
 		counted as separate characters.
+
+		When {utf16} is present and TRUE, {idx} is used as the UTF-16
+		index in the String {string} instead of as the byte index.
+
 		Returns -1 if the arguments are invalid or if {idx} is greater
 		than the index of the last byte in {string}.  An error is
 		given if the first argument is not a string, the second
 		argument is not a number or when the third argument is present
 		and is not zero or one.
+
 		See |byteidx()| and |byteidxcomp()| for getting the byte index
-		from the character index.
+		from the character index and |utf16idx()| for getting the
+		UTF-16 index from the character index.
+		Refer to |string-offset-encoding| for more information.
 		Examples: >
 			echo charidx('áb́ć', 3)		returns 1
 			echo charidx('áb́ć', 6, 1)	returns 4
 			echo charidx('áb́ć', 16)		returns -1
+			echo charidx('a😊😊', 4, 0, 1)	returns 2
 <
 		Can also be used as a |method|: >
 			GetName()->charidx(idx)
@@ -8332,6 +8360,28 @@ strtrans({string})					*strtrans()*
 		Can also be used as a |method|: >
 			GetString()->strtrans()
 
+strutf16len({string} [, {countcc}])			*strutf16len()*
+		The result is a Number, which is the number of UTF-16 code
+		units in String {string} (after converting it to UTF-16).
+
+		When {countcc} is TRUE, composing characters are counted
+		separately.
+		When {countcc} is omitted or FALSE, composing characters are
+		ignored.
+
+		Returns zero on error.
+
+		Also see |strlen()| and |strcharlen()|.
+		Examples: >
+		    echo strutf16len('a')		returns 1
+		    echo strutf16len('©')		returns 1
+		    echo strutf16len('😊')		returns 2
+		    echo strutf16len('ą́')		returns 1
+		    echo strutf16len('ą́', v:true)	returns 3
+<
+		Can also be used as a |method|: >
+			GetText()->strutf16len()
+<
 strwidth({string})					*strwidth()*
 		The result is a Number, which is the number of display cells
 		String {string} occupies.  A Tab character is counted as one
@@ -9063,6 +9113,34 @@ uniq({list} [, {func} [, {dict}]])			*uniq()* *E882*
 
 		Can also be used as a |method|: >
 			mylist->uniq()
+<
+							*utf16idx()*
+utf16idx({string}, {idx} [, {countcc} [, {charidx}]])
+		Same as |charidx()| but returns the UTF-16 index of the byte
+		at {idx} in {string} (after converting it to UTF-16).
+
+		When {charidx} is present and TRUE, {idx} is used as the
+		character index in the String {string} instead of as the byte
+		index.
+		An {idx} in the middle of a UTF-8 sequence is rounded upwards
+		to the end of that sequence.
+
+		See |byteidx()| and |byteidxcomp()| for getting the byte index
+		from the UTF-16 index and |charidx()| for getting the
+		character index from the UTF-16 index.
+		Refer to |string-offset-encoding| for more information.
+		Examples: >
+			echo utf16idx('a😊😊', 3)	returns 2
+			echo utf16idx('a😊😊', 7)	returns 4
+			echo utf16idx('a😊😊', 1, 0, 1)	returns 2
+			echo utf16idx('a😊😊', 2, 0, 1)	returns 4
+			echo utf16idx('aą́c', 6)		returns 2
+			echo utf16idx('aą́c', 6, 1)	returns 4
+			echo utf16idx('a😊😊', 9)	returns -1
+<
+		Can also be used as a |method|: >
+			GetName()->utf16idx(idx)
+
 
 values({dict})						*values()*
 		Return a |List| with all the values of {dict}.  The |List| is
diff --git a/runtime/doc/eval.txt b/runtime/doc/eval.txt
index f80ca5346c..0c18fd5b4e 100644
--- a/runtime/doc/eval.txt
+++ b/runtime/doc/eval.txt
@@ -1433,6 +1433,32 @@ Examples: >
 	echo $"The square root of {{9}} is {sqrt(9)}"
 <	The square root of {9} is 3.0 ~
 
+						*string-offset-encoding*
+A string consists of multiple characters.  UTF-8 uses one byte for ASCII
+characters, two bytes for other latin characters and more bytes for other
+characters.
+
+A string offset can count characters or bytes.  Other programs may use
+UTF-16 encoding (16-bit words) and an offset of UTF-16 words.  Some functions
+use byte offsets, usually for UTF-8 encoding.  Other functions use character
+offsets, in which case the encoding doesn't matter.
+
+The different offsets for the string "a©😊" are below:
+
+  UTF-8 offsets:
+      [0]: 61, [1]: C2, [2]: A9, [3]: F0, [4]: 9F, [5]: 98, [6]: 8A
+  UTF-16 offsets:
+      [0]: 0061, [1]: 00A9, [2]: D83D, [3]: DE0A
+  UTF-32 (character) offsets:
+      [0]: 00000061, [1]: 000000A9, [2]: 0001F60A
+
+You can use the "g8" and "ga" commands on a character to see the
+decimal/hex/octal values.
+
+The functions |byteidx()|, |utf16idx()| and |charidx()| can be used to convert
+between these indices.  The functions |strlen()|, |strutf16len()| and
+|strcharlen()| return the number of bytes, UTF-16 code units and characters in
+a string respectively.
 
 ------------------------------------------------------------------------------
 option						*expr-option* *E112* *E113*
diff --git a/runtime/doc/usr_41.txt b/runtime/doc/usr_41.txt
index 89111535ca..8e1b72eadc 100644
--- a/runtime/doc/usr_41.txt
+++ b/runtime/doc/usr_41.txt
@@ -621,6 +621,7 @@ String manipulation:					*string-functions*
 	strlen()		length of a string in bytes
 	strcharlen()		length of a string in characters
 	strchars()		number of characters in a string
+	strutf16len()		number of UTF-16 code units in a string
 	strwidth()		size of string when displayed
 	strdisplaywidth()	size of string when displayed, deals with tabs
 	setcellwidths()		set character cell width overrides
@@ -636,6 +637,7 @@ String manipulation:					*string-functions*
 	byteidx()		byte index of a character in a string
 	byteidxcomp()		like byteidx() but count composing characters
 	charidx()		character index of a byte in a string
+	utf16idx()		UTF-16 index of a byte in a string
 	repeat()		repeat a string multiple times
 	eval()			evaluate a string expression
 	execute()		execute an Ex command and get the output
diff --git a/src/nvim/eval.lua b/src/nvim/eval.lua
index 357ecd5575..09705148d0 100644
--- a/src/nvim/eval.lua
+++ b/src/nvim/eval.lua
@@ -65,8 +65,8 @@ return {
     bufwinid={args=1, base=1},
     bufwinnr={args=1, base=1},
     byte2line={args=1, base=1},
-    byteidx={args=2, base=1, fast=true},
-    byteidxcomp={args=2, base=1, fast=true},
+    byteidx={args={2, 3}, base=1, fast=true},
+    byteidxcomp={args={2, 3}, base=1, fast=true},
     call={args={2, 3}, base=1},
     ceil={args=1, base=1, float_func="ceil"},
     changenr={},
@@ -75,7 +75,7 @@ return {
     char2nr={args={1, 2}, base=1, fast=true},
     charclass={args=1, base=1},
     charcol={args={1, 2}, base=1},
-    charidx={args={2, 3}, base=1},
+    charidx={args={2, 4}, base=1},
     chdir={args=1, base=1},
     cindent={args=1, base=1},
     clearmatches={args={0, 1}, base=1},
@@ -397,6 +397,7 @@ return {
     strptime={args=2, base=1},
     strridx={args={2, 3}, base=1},
     strtrans={args=1, base=1, fast=true},
+    strutf16len={args={1, 2}, base=1},
     strwidth={args=1, base=1, fast=true},
     submatch={args={1, 2}, base=1},
     substitute={args=4, base=1},
@@ -435,6 +436,7 @@ return {
     undofile={args=1, base=1},
     undotree={},
     uniq={args={1, 3}, base=1},
+    utf16idx={args={2, 4}, base=1},
     values={args=1, base=1},
     virtcol={args={1, 2}, base=1},
     virtcol2col={args=3, base=1},
diff --git a/src/nvim/strings.c b/src/nvim/strings.c
index d5d7d62c38..e8c04aa5c7 100644
--- a/src/nvim/strings.c
+++ b/src/nvim/strings.c
@@ -1504,22 +1504,44 @@ char *strrep(const char *src, const char *what, const char *rep)
 
 static void byteidx(typval_T *argvars, typval_T *rettv, int comp)
 {
+  rettv->vval.v_number = -1;
+
   const char *const str = tv_get_string_chk(&argvars[0]);
   varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
-  rettv->vval.v_number = -1;
   if (str == NULL || idx < 0) {
     return;
   }
 
+  varnumber_T utf16idx = false;
+  if (argvars[2].v_type != VAR_UNKNOWN) {
+    utf16idx = tv_get_bool(&argvars[2]);
+    if (utf16idx < 0 || utf16idx > 1) {
+      semsg(_(e_using_number_as_bool_nr), utf16idx);
+      return;
+    }
+  }
+
+  int (*ptr2len)(const char *);
+  if (comp) {
+    ptr2len = utf_ptr2len;
+  } else {
+    ptr2len = utfc_ptr2len;
+  }
+
   const char *t = str;
   for (; idx > 0; idx--) {
     if (*t == NUL) {  // EOL reached.
       return;
     }
-    if (comp) {
-      t += utf_ptr2len(t);
-    } else {
-      t += utfc_ptr2len(t);
+    if (utf16idx) {
+      const int clen = ptr2len(t);
+      const int c = (clen > 1) ? utf_ptr2char(t) : *t;
+      if (c > 0xFFFF) {
+        idx--;
+      }
+    }
+    if (idx > 0) {
+      t += ptr2len(t);
     }
   }
   rettv->vval.v_number = (varnumber_T)(t - str);
@@ -1542,24 +1564,27 @@ void f_charidx(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
 {
   rettv->vval.v_number = -1;
 
-  if ((tv_check_for_string_arg(argvars, 0) == FAIL
-       || tv_check_for_number_arg(argvars, 1) == FAIL
-       || tv_check_for_opt_bool_arg(argvars, 2) == FAIL)) {
+  if (tv_check_for_string_arg(argvars, 0) == FAIL
+      || tv_check_for_number_arg(argvars, 1) == FAIL
+      || tv_check_for_opt_bool_arg(argvars, 2) == FAIL
+      || (argvars[2].v_type != VAR_UNKNOWN
+          && tv_check_for_opt_bool_arg(argvars, 3) == FAIL)) {
     return;
   }
 
-  const char *str = tv_get_string_chk(&argvars[0]);
+  const char *const str = tv_get_string_chk(&argvars[0]);
   varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
   if (str == NULL || idx < 0) {
     return;
   }
-  int countcc = 0;
+
+  varnumber_T countcc = false;
+  varnumber_T utf16idx = false;
   if (argvars[2].v_type != VAR_UNKNOWN) {
-    countcc = (int)tv_get_number(&argvars[2]);
-  }
-  if (countcc < 0 || countcc > 1) {
-    semsg(_(e_using_number_as_bool_nr), countcc);
-    return;
+    countcc = tv_get_bool(&argvars[2]);
+    if (argvars[3].v_type != VAR_UNKNOWN) {
+      utf16idx = tv_get_bool(&argvars[3]);
+    }
   }
 
   int (*ptr2len)(const char *);
@@ -1571,10 +1596,18 @@ void f_charidx(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
 
   const char *p;
   int len;
-  for (p = str, len = 0; p <= str + idx; len++) {
+  for (p = str, len = 0; utf16idx ? idx >= 0 : p <= str + idx; len++) {
     if (*p == NUL) {
       return;
     }
+    if (utf16idx) {
+      idx--;
+      const int clen = ptr2len(p);
+      const int c = (clen > 1) ? utf_ptr2char(p) : *p;
+      if (c > 0xFFFF) {
+        idx--;
+      }
+    }
     p += ptr2len(p);
   }
 
@@ -1743,6 +1776,36 @@ void f_strchars(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
   }
 }
 
+/// "strutf16len()" function
+void f_strutf16len(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
+{
+  rettv->vval.v_number = -1;
+
+  if (tv_check_for_string_arg(argvars, 0) == FAIL
+      || tv_check_for_opt_bool_arg(argvars, 1) == FAIL) {
+    return;
+  }
+
+  varnumber_T countcc = false;
+  if (argvars[1].v_type != VAR_UNKNOWN) {
+    countcc = tv_get_bool(&argvars[1]);
+  }
+
+  const char *s = tv_get_string(&argvars[0]);
+  varnumber_T len = 0;
+  int (*func_mb_ptr2char_adv)(const char **pp);
+
+  func_mb_ptr2char_adv = countcc ? mb_cptr2char_adv : mb_ptr2char_adv;
+  while (*s != NUL) {
+    const int ch = func_mb_ptr2char_adv(&s);
+    if (ch > 0xFFFF) {
+      len++;
+    }
+    len++;
+  }
+  rettv->vval.v_number = len;
+}
+
 /// "strdisplaywidth()" function
 void f_strdisplaywidth(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
 {
@@ -1914,6 +1977,61 @@ void f_strtrans(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
   rettv->vval.v_string = transstr(tv_get_string(&argvars[0]), true);
 }
 
+/// "utf16idx()" function
+void f_utf16idx(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
+{
+  rettv->vval.v_number = -1;
+
+  if (tv_check_for_string_arg(argvars, 0) == FAIL
+      || tv_check_for_opt_number_arg(argvars, 1) == FAIL
+      || tv_check_for_opt_bool_arg(argvars, 2) == FAIL
+      || (argvars[2].v_type != VAR_UNKNOWN
+          && tv_check_for_opt_bool_arg(argvars, 3) == FAIL)) {
+    return;
+  }
+
+  const char *const str = tv_get_string_chk(&argvars[0]);
+  varnumber_T idx = tv_get_number_chk(&argvars[1], NULL);
+  if (str == NULL || idx < 0) {
+    return;
+  }
+
+  varnumber_T countcc = false;
+  varnumber_T charidx = false;
+  if (argvars[2].v_type != VAR_UNKNOWN) {
+    countcc = tv_get_bool(&argvars[2]);
+    if (argvars[3].v_type != VAR_UNKNOWN) {
+      charidx = tv_get_bool(&argvars[3]);
+    }
+  }
+
+  int (*ptr2len)(const char *);
+  if (countcc) {
+    ptr2len = utf_ptr2len;
+  } else {
+    ptr2len = utfc_ptr2len;
+  }
+
+  const char *p;
+  int len;
+  for (p = str, len = 0; charidx ? idx >= 0 : p <= str + idx; len++) {
+    if (*p == NUL) {
+      return;
+    }
+    const int clen = ptr2len(p);
+    const int c = (clen > 1) ? utf_ptr2char(p) : *p;
+    if (c > 0xFFFF) {
+      len++;
+    }
+    p += ptr2len(p);
+    if (charidx) {
+      idx--;
+    }
+  }
+
+  rettv->vval.v_number = len > 0 ? len - 1 : 0;
+}
+
 /// "tolower(string)" function
 void f_tolower(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
 {
diff --git a/test/old/testdir/test_functions.vim b/test/old/testdir/test_functions.vim
index 99ba711d23..b934f7fac2 100644
--- a/test/old/testdir/test_functions.vim
+++ b/test/old/testdir/test_functions.vim
@@ -1064,19 +1064,14 @@ func Test_byte2line_line2byte()
   bw!
 endfunc
 
-" Test for byteidx() and byteidxcomp() functions
+" Test for byteidx() using a character index
 func Test_byteidx()
   let a = '.é.' " one char of two bytes
   call assert_equal(0, byteidx(a, 0))
-  call assert_equal(0, byteidxcomp(a, 0))
   call assert_equal(1, byteidx(a, 1))
-  call assert_equal(1, byteidxcomp(a, 1))
   call assert_equal(3, byteidx(a, 2))
-  call assert_equal(3, byteidxcomp(a, 2))
   call assert_equal(4, byteidx(a, 3))
-  call assert_equal(4, byteidxcomp(a, 3))
   call assert_equal(-1, byteidx(a, 4))
-  call assert_equal(-1, byteidxcomp(a, 4))
 
   let b = '.é.' " normal e with composing char
   call assert_equal(0, b->byteidx(0))
@@ -1084,18 +1079,184 @@ func Test_byteidx()
   call assert_equal(4, b->byteidx(2))
   call assert_equal(5, b->byteidx(3))
   call assert_equal(-1, b->byteidx(4))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  call assert_equal(0, byteidx(str, 0))
+  call assert_equal(1, byteidx(str, 1))
+  call assert_equal(6, byteidx(str, 2))
+  call assert_equal(7, byteidx(str, 3))
+  call assert_equal(12, byteidx(str, 4))
+  call assert_equal(-1, byteidx(str, 5))
+
+  " empty string
+  call assert_equal(0, byteidx('', 0))
+  call assert_equal(-1, byteidx('', 1))
+
+  " error cases
   call assert_fails("call byteidx([], 0)", 'E730:')
+  call assert_fails("call byteidx('abc', [])", 'E745:')
+endfunc
+
+" Test for byteidxcomp() using a character index
+func Test_byteidxcomp()
+  let a = '.é.' " one char of two bytes
+  call assert_equal(0, byteidxcomp(a, 0))
+  call assert_equal(1, byteidxcomp(a, 1))
+  call assert_equal(3, byteidxcomp(a, 2))
+  call assert_equal(4, byteidxcomp(a, 3))
+  call assert_equal(-1, byteidxcomp(a, 4))
 
+  let b = '.é.' " normal e with composing char
   call assert_equal(0, b->byteidxcomp(0))
   call assert_equal(1, b->byteidxcomp(1))
   call assert_equal(2, b->byteidxcomp(2))
   call assert_equal(4, b->byteidxcomp(3))
   call assert_equal(5, b->byteidxcomp(4))
   call assert_equal(-1, b->byteidxcomp(5))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  call assert_equal(0, byteidxcomp(str, 0))
+  call assert_equal(1, byteidxcomp(str, 1))
+  call assert_equal(2, byteidxcomp(str, 2))
+  call assert_equal(4, byteidxcomp(str, 3))
+  call assert_equal(6, byteidxcomp(str, 4))
+  call assert_equal(7, byteidxcomp(str, 5))
+  call assert_equal(8, byteidxcomp(str, 6))
+  call assert_equal(10, byteidxcomp(str, 7))
+  call assert_equal(12, byteidxcomp(str, 8))
+  call assert_equal(-1, byteidxcomp(str, 9))
+
+  " empty string
+  call assert_equal(0, byteidxcomp('', 0))
+  call assert_equal(-1, byteidxcomp('', 1))
+
+  " error cases
   call assert_fails("call byteidxcomp([], 0)", 'E730:')
+  call assert_fails("call byteidxcomp('abc', [])", 'E745:')
 endfunc
 
-" Test for charidx()
+" Test for byteidx() using a UTF-16 index
+func Test_byteidx_from_utf16_index()
+  " string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, byteidx(str, i, v:true))
+  endfor
+  call assert_equal(3, byteidx(str, 3, v:true))
+  call assert_equal(-1, byteidx(str, 4, v:true))
+
+  " string with two byte characters
+  let str = "a©©b"
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(3, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(6, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " string with four byte characters
+  let str = "a😊😊b"
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(1, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(5, byteidx(str, 4, v:true))
+  call assert_equal(9, byteidx(str, 5, v:true))
+  call assert_equal(10, byteidx(str, 6, v:true))
+  call assert_equal(-1, byteidx(str, 7, v:true))
+
+  " string with composing characters
+  let str = '-á-b́'
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(4, byteidx(str, 2, v:true))
+  call assert_equal(5, byteidx(str, 3, v:true))
+  call assert_equal(8, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  call assert_equal(0, byteidx(str, 0, v:true))
+  call assert_equal(1, byteidx(str, 1, v:true))
+  call assert_equal(6, byteidx(str, 2, v:true))
+  call assert_equal(7, byteidx(str, 3, v:true))
+  call assert_equal(12, byteidx(str, 4, v:true))
+  call assert_equal(-1, byteidx(str, 5, v:true))
+
+  " empty string
+  call assert_equal(0, byteidx('', 0, v:true))
+  call assert_equal(-1, byteidx('', 1, v:true))
+
+  " error cases
+  call assert_fails('call byteidx(str, 0, [])', 'E745:')
+endfunc
+
+" Test for byteidxcomp() using a UTF-16 index
+func Test_byteidxcomp_from_utf16_index()
+  " string with single byte characters
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, byteidxcomp(str, i, v:true))
+  endfor
+  call assert_equal(3, byteidxcomp(str, 3, v:true))
+  call assert_equal(-1, byteidxcomp(str, 4, v:true))
+
+  " string with two byte characters
+  let str = "a©©b"
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(3, byteidxcomp(str, 2, v:true))
+  call assert_equal(5, byteidxcomp(str, 3, v:true))
+  call assert_equal(6, byteidxcomp(str, 4, v:true))
+  call assert_equal(-1, byteidxcomp(str, 5, v:true))
+
+  " string with four byte characters
+  let str = "a😊😊b"
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(1, byteidxcomp(str, 2, v:true))
+  call assert_equal(5, byteidxcomp(str, 3, v:true))
+  call assert_equal(5, byteidxcomp(str, 4, v:true))
+  call assert_equal(9, byteidxcomp(str, 5, v:true))
+  call assert_equal(10, byteidxcomp(str, 6, v:true))
+  call assert_equal(-1, byteidxcomp(str, 7, v:true))
+
+  " string with composing characters
+  let str = '-á-b́'
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(2, byteidxcomp(str, 2, v:true))
+  call assert_equal(4, byteidxcomp(str, 3, v:true))
+  call assert_equal(5, byteidxcomp(str, 4, v:true))
+  call assert_equal(6, byteidxcomp(str, 5, v:true))
+  call assert_equal(8, byteidxcomp(str, 6, v:true))
+  call assert_equal(-1, byteidxcomp(str, 7, v:true))
+  call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
+
+  " string with multiple composing characters
+  let str = '-ą́-ą́'
+  call assert_equal(0, byteidxcomp(str, 0, v:true))
+  call assert_equal(1, byteidxcomp(str, 1, v:true))
+  call assert_equal(2, byteidxcomp(str, 2, v:true))
+  call assert_equal(4, byteidxcomp(str, 3, v:true))
+  call assert_equal(6, byteidxcomp(str, 4, v:true))
+  call assert_equal(7, byteidxcomp(str, 5, v:true))
+  call assert_equal(8, byteidxcomp(str, 6, v:true))
+  call assert_equal(10, byteidxcomp(str, 7, v:true))
+  call assert_equal(12, byteidxcomp(str, 8, v:true))
+  call assert_equal(-1, byteidxcomp(str, 9, v:true))
+
+  " empty string
+  call assert_equal(0, byteidxcomp('', 0, v:true))
+  call assert_equal(-1, byteidxcomp('', 1, v:true))
+
+  " error cases
+  call assert_fails('call byteidxcomp(str, 0, [])', 'E745:')
+endfunc
+
+" Test for charidx() using a byte index
 func Test_charidx()
   let a = 'xáb́y'
   call assert_equal(0, charidx(a, 0))
@@ -1104,17 +1265,20 @@ func Test_charidx()
   call assert_equal(3, charidx(a, 7))
   call assert_equal(-1, charidx(a, 8))
   call assert_equal(-1, charidx(a, -1))
-  call assert_equal(-1, charidx('', 0))
-  call assert_equal(-1, charidx(v:_null_string, 0))
 
   " count composing characters
-  call assert_equal(0, charidx(a, 0, 1))
-  call assert_equal(2, charidx(a, 2, 1))
-  call assert_equal(3, charidx(a, 4, 1))
-  call assert_equal(5, charidx(a, 7, 1))
-  call assert_equal(-1, charidx(a, 8, 1))
+  call assert_equal(0, a->charidx(0, 1))
+  call assert_equal(2, a->charidx(2, 1))
+  call assert_equal(3, a->charidx(4, 1))
+  call assert_equal(5, a->charidx(7, 1))
+  call assert_equal(-1, a->charidx(8, 1))
+
+  " empty string
+  call assert_equal(-1, charidx('', 0))
   call assert_equal(-1, charidx('', 0, 1))
 
+  " error cases
+  call assert_equal(-1, charidx(v:_null_string, 0))
   call assert_fails('let x = charidx([], 1)', 'E1174:')
   call assert_fails('let x = charidx("abc", [])', 'E1210:')
   call assert_fails('let x = charidx("abc", 1, [])', 'E1212:')
@@ -1122,6 +1286,237 @@ func Test_charidx()
   call assert_fails('let x = charidx("abc", 1, 2)', 'E1212:')
 endfunc
 
+" Test for charidx() using a UTF-16 index
+func Test_charidx_from_utf16_index()
+  " single byte characters: UTF-16 index and character index are the same
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 3, v:false, v:true))
+
+  " two byte characters: still one UTF-16 code unit per character
+  let str = "a©©b"
+  call assert_equal(0, charidx(str, 0, v:false, v:true))
+  call assert_equal(1, charidx(str, 1, v:false, v:true))
+  call assert_equal(2, charidx(str, 2, v:false, v:true))
+  call assert_equal(3, charidx(str, 3, v:false, v:true))
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+
+  " four byte characters: two UTF-16 indexes map to the same character
+  let str = "a😊😊b"
+  call assert_equal(0, charidx(str, 0, v:false, v:true))
+  call assert_equal(1, charidx(str, 1, v:false, v:true))
+  call assert_equal(1, charidx(str, 2, v:false, v:true))
+  call assert_equal(2, charidx(str, 3, v:false, v:true))
+  call assert_equal(2, charidx(str, 4, v:false, v:true))
+  call assert_equal(3, charidx(str, 5, v:false, v:true))
+  call assert_equal(-1, charidx(str, 6, v:false, v:true))
+
+  " composing characters: counted separately only when the 3rd arg is v:true
+  let str = '-á-b́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, charidx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 6, v:true, v:true))
+
+  " multiple composing characters on one base character
+  let str = '-ą́-ą́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, charidx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, charidx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, charidx(str, 8, v:true, v:true))
+
+  " empty string
+  call assert_equal(-1, charidx('', 0, v:false, v:true))
+  call assert_equal(-1, charidx('', 0, v:true, v:true))
+
+  " error cases: negative index, null string, non-bool 4th argument
+  call assert_equal(-1, charidx('abc', -1, v:false, v:true))
+  call assert_equal(-1, charidx('abc', -1, v:true, v:true))
+  call assert_equal(-1, charidx(v:_null_string, 0, v:false, v:true))
+  call assert_fails('let x = charidx("abc", 1, v:false, [])', 'E1212:')
+  call assert_fails('let x = charidx("abc", 1, v:true, [])', 'E1212:')
+endfunc
+
+" Test for utf16idx() using a byte index
+func Test_utf16idx_from_byteidx()
+  " single byte characters: byte index and UTF-16 index are the same
+  let str = "abc"
+  for i in range(3)
+    call assert_equal(i, utf16idx(str, i))
+  endfor
+  call assert_equal(-1, utf16idx(str, 3))
+
+  " two byte characters: both bytes of a character give the same UTF-16 index
+  let str = 'a©©b'
+  call assert_equal(0, str->utf16idx(0))
+  call assert_equal(1, str->utf16idx(1))
+  call assert_equal(1, str->utf16idx(2))
+  call assert_equal(2, str->utf16idx(3))
+  call assert_equal(2, str->utf16idx(4))
+  call assert_equal(3, str->utf16idx(5))
+  call assert_equal(-1, str->utf16idx(6))
+
+  " four byte characters: all four bytes give the same UTF-16 index
+  let str = 'a😊😊b'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(2, utf16idx(str, 1))
+  call assert_equal(2, utf16idx(str, 2))
+  call assert_equal(2, utf16idx(str, 3))
+  call assert_equal(2, utf16idx(str, 4))
+  call assert_equal(4, utf16idx(str, 5))
+  call assert_equal(4, utf16idx(str, 6))
+  call assert_equal(4, utf16idx(str, 7))
+  call assert_equal(4, utf16idx(str, 8))
+  call assert_equal(5, utf16idx(str, 9))
+  call assert_equal(-1, utf16idx(str, 10))
+
+  " composing characters: get their own index only when the 3rd arg is v:true
+  let str = '-á-b́'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(1, utf16idx(str, 1))
+  call assert_equal(1, utf16idx(str, 2))
+  call assert_equal(1, utf16idx(str, 3))
+  call assert_equal(2, utf16idx(str, 4))
+  call assert_equal(3, utf16idx(str, 5))
+  call assert_equal(3, utf16idx(str, 6))
+  call assert_equal(3, utf16idx(str, 7))
+  call assert_equal(-1, utf16idx(str, 8))
+  call assert_equal(0, utf16idx(str, 0, v:true))
+  call assert_equal(1, utf16idx(str, 1, v:true))
+  call assert_equal(2, utf16idx(str, 2, v:true))
+  call assert_equal(2, utf16idx(str, 3, v:true))
+  call assert_equal(3, utf16idx(str, 4, v:true))
+  call assert_equal(4, utf16idx(str, 5, v:true))
+  call assert_equal(5, utf16idx(str, 6, v:true))
+  call assert_equal(5, utf16idx(str, 7, v:true))
+  call assert_equal(-1, utf16idx(str, 8, v:true))
+
+  " multiple composing characters on one base character
+  let str = '-ą́-ą́'
+  call assert_equal(0, utf16idx(str, 0))
+  call assert_equal(1, utf16idx(str, 1))
+  call assert_equal(1, utf16idx(str, 2))
+  call assert_equal(1, utf16idx(str, 3))
+  call assert_equal(1, utf16idx(str, 4))
+  call assert_equal(1, utf16idx(str, 5))
+  call assert_equal(2, utf16idx(str, 6))
+  call assert_equal(3, utf16idx(str, 7))
+  call assert_equal(3, utf16idx(str, 8))
+  call assert_equal(3, utf16idx(str, 9))
+  call assert_equal(3, utf16idx(str, 10))
+  call assert_equal(3, utf16idx(str, 11))
+  call assert_equal(-1, utf16idx(str, 12))
+  call assert_equal(0, utf16idx(str, 0, v:true))
+  call assert_equal(1, utf16idx(str, 1, v:true))
+  call assert_equal(2, utf16idx(str, 2, v:true))
+  call assert_equal(2, utf16idx(str, 3, v:true))
+  call assert_equal(3, utf16idx(str, 4, v:true))
+  call assert_equal(3, utf16idx(str, 5, v:true))
+  call assert_equal(4, utf16idx(str, 6, v:true))
+  call assert_equal(5, utf16idx(str, 7, v:true))
+  call assert_equal(6, utf16idx(str, 8, v:true))
+  call assert_equal(6, utf16idx(str, 9, v:true))
+  call assert_equal(7, utf16idx(str, 10, v:true))
+  call assert_equal(7, utf16idx(str, 11, v:true))
+  call assert_equal(-1, utf16idx(str, 12, v:true))
+
+  " empty string
+  call assert_equal(-1, utf16idx('', 0))
+  call assert_equal(-1, utf16idx('', 0, v:true))
+
+  " error cases: negative index, null string, wrong argument types
+  call assert_equal(-1, utf16idx("abc", -1, v:true))
+  call assert_equal(-1, utf16idx("abc", -1))
+  call assert_equal(-1, utf16idx(v:_null_string, 0))
+  call assert_fails('let l = utf16idx([], 0)', 'E1174:')
+  call assert_fails('let l = utf16idx("ab", [])', 'E1210:')
+  call assert_fails('let l = utf16idx("ab", 0, [])', 'E1212:')
+endfunc
+
+" Test for utf16idx() using a character index
+func Test_utf16idx_from_charidx()
+  let str = "abc"
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 3, v:false, v:true))
+
+  " two byte characters: character index and UTF-16 index are the same
+  let str = "a©©b"
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+
+  " four byte characters: each one moves the UTF-16 index forward by two
+  let str = "a😊😊b"
+  call assert_equal(0, utf16idx(str, 0, v:false, v:true))
+  call assert_equal(2, utf16idx(str, 1, v:false, v:true))
+  call assert_equal(4, utf16idx(str, 2, v:false, v:true))
+  call assert_equal(5, utf16idx(str, 3, v:false, v:true))
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+
+  " composing characters: counted separately only when the 3rd arg is v:true
+  let str = '-á-b́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, utf16idx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 6, v:true, v:true))
+
+  " multiple composing characters on one base character
+  let str = '-ą́-ą́'
+  for i in str->strcharlen()->range()
+    call assert_equal(i, utf16idx(str, i, v:false, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 4, v:false, v:true))
+  for i in str->strchars()->range()
+    call assert_equal(i, utf16idx(str, i, v:true, v:true))
+  endfor
+  call assert_equal(-1, utf16idx(str, 8, v:true, v:true))
+
+  " empty string: index 0 is already out of range
+  call assert_equal(-1, utf16idx('', 0, v:false, v:true))
+  call assert_equal(-1, utf16idx('', 0, v:true, v:true))
+
+  " error cases: null string gives -1, non-bool 4th argument fails with E1212
+  call assert_equal(-1, utf16idx(v:_null_string, 0, v:true, v:true))
+  call assert_fails('let l = utf16idx("ab", 0, v:false, [])', 'E1212:')
+endfunc
+
+" Test for strutf16len()
+func Test_strutf16len()
+  call assert_equal(3, strutf16len('abc'))
+  call assert_equal(3, 'abc'->strutf16len(v:true))
+  call assert_equal(4, strutf16len('a©©b'))
+  call assert_equal(4, strutf16len('a©©b', v:true))
+  call assert_equal(6, strutf16len('a😊😊b'))
+  call assert_equal(6, strutf16len('a😊😊b', v:true))
+  call assert_equal(4, strutf16len('-á-b́'))
+  call assert_equal(6, strutf16len('-á-b́', v:true))
+  call assert_equal(4, strutf16len('-ą́-ą́'))
+  call assert_equal(8, strutf16len('-ą́-ą́', v:true))
+  call assert_equal(0, strutf16len(''))
+
+  " wrong argument types fail; a null string is accepted and has length 0
+  call assert_fails('let l = strutf16len([])', 'E1174:')
+  call assert_fails('let l = strutf16len("a", [])', 'E1212:')
+  call assert_equal(0, strutf16len(v:_null_string))
+endfunc
+
 func Test_count()
   let l = ['a', 'a', 'A', 'b']
   call assert_equal(2, count(l, 'a'))
@@ -2644,5 +3039,4 @@ func Test_delfunc_while_listing()
   call StopVimInTerminal(buf)
 endfunc
 
-
 " vim: shiftwidth=2 sts=2 expandtab
author	zeertzjq <zeertzjq@outlook.com>	2023-04-26 09:50:37 +0800
committer	GitHub <noreply@github.com>	2023-04-26 09:50:37 +0800
commit	191e8b40625731a652bade7000911554834afe5f (patch)
tree	ba2267f67dc2fc833de914708efde646d9df466b
parent	8af97ecefa71c6391a52ab799d354e058cb470be (diff)
download	rneovim-191e8b40625731a652bade7000911554834afe5f.tar.gz rneovim-191e8b40625731a652bade7000911554834afe5f.tar.bz2 rneovim-191e8b40625731a652bade7000911554834afe5f.zip