From 230b0c7f021a57647a658edce27fe115343f083f Mon Sep 17 00:00:00 2001
From: Tristan Knight <admin@snappeh.com>
Date: Wed, 23 Oct 2024 14:33:57 +0100
Subject: feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735

PROBLEM:
There are several limitations to vim.str_byteindex, vim.str_utfindex:
1. They throw when given out-of-range indexes. An invalid (often user/LSP-provided)
   index doesn't feel exceptional and should be handled by the caller.
   `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred
   way to handle this kind of failure.
2. They cannot accept an encoding. So LSP needs wrapper functions. #25272
3. The current signatures are not extensible.
    * Calling: The function currently uses a fairly opaque boolean value to
      identify the encoding.
    * Returns: The fact it can throw requires wrapping in pcall.
4. The current name doesn't follow suggestions in `:h dev-naming` and I think
   `get` would be suitable.

SOLUTION:
- Because these are performance-sensitive, don't introduce `opts`.
- Introduce an "overload" that accepts `encoding:string` and
  `strict_indexing:bool` params.

```lua
local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]])
```

Support the old versions by dispatching on the type of argument 2, and
deprecate that form.

```lua
vim.str_utfindex(line)                             -- (utf-32 length, utf-16 length), deprecated
vim.str_utfindex(line, index)                      -- (utf-32 index, utf-16 index), deprecated
vim.str_utfindex(line, 'utf-16')                   -- utf-16 length
vim.str_utfindex(line, 'utf-16', index)            -- utf-16 index
vim.str_utfindex(line, 'utf-16', math.huge)        -- error: index out of range
vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length
```
---
 runtime/doc/lua.txt | 71 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

(limited to 'runtime/doc')

diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt
index 9d422026e6..c873523d9d 100644
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -1000,22 +1000,6 @@ vim.schedule({fn})                                            *vim.schedule()*
     Parameters: ~
       • {fn}  (`fun()`)
 
-vim.str_byteindex({str}, {index}, {use_utf16})           *vim.str_byteindex()*
-    Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
-    supplied, it defaults to false (use UTF-32). Returns the byte index.
-
-    Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
-    in the middle of a UTF-16 sequence is rounded upwards to the end of that
-    sequence.
-
-    Parameters: ~
-      • {str}        (`string`)
-      • {index}      (`integer`)
-      • {use_utf16}  (`boolean?`)
-
-    Return: ~
-        (`integer`)
-
 vim.str_utf_end({str}, {index})                            *vim.str_utf_end()*
     Gets the distance (in bytes) from the last byte of the codepoint
     (character) that {index} points to.
@@ -1073,23 +1057,6 @@ vim.str_utf_start({str}, {index})                        *vim.str_utf_start()*
     Return: ~
         (`integer`)
 
-vim.str_utfindex({str}, {index})                          *vim.str_utfindex()*
-    Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
-    supplied, the length of the string is used. All indices are zero-based.
-
-    Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
-    bytes, and embedded surrogates are counted as one code point each. An
-    {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
-    that sequence.
-
-    Parameters: ~
-      • {str}    (`string`)
-      • {index}  (`integer?`)
-
-    Return (multiple): ~
-        (`integer`) UTF-32 index
-        (`integer`) UTF-16 index
-
 vim.stricmp({a}, {b})                                          *vim.stricmp()*
     Compares strings case-insensitively.
 
@@ -1776,6 +1743,44 @@ vim.schedule_wrap({fn})                                  *vim.schedule_wrap()*
       • |vim.schedule()|
       • |vim.in_fast_event()|
 
+                                                         *vim.str_byteindex()*
+vim.str_byteindex({s}, {encoding}, {index}, {strict_indexing})
+    Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. If
+    {strict_indexing} is false then an out of range index will return
+    byte length instead of throwing an error.
+
+    Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|. An {index}
+    in the middle of a UTF-16 sequence is rounded upwards to the end of that
+    sequence.
+
+    Parameters: ~
+      • {s}                (`string`)
+      • {encoding}         (`"utf-8"|"utf-16"|"utf-32"`)
+      • {index}            (`integer`)
+      • {strict_indexing}  (`boolean?`) default: true
+
+    Return: ~
+        (`integer`)
+
+                                                          *vim.str_utfindex()*
+vim.str_utfindex({s}, {encoding}, {index}, {strict_indexing})
+    Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
+    supplied, the length of the string is used. All indices are zero-based.
+
+    If {strict_indexing} is false then an out of range index will return
+    string length instead of throwing an error. Invalid UTF-8 bytes, and
+    embedded surrogates are counted as one code point each. An {index} in the
+    middle of a UTF-8 sequence is rounded upwards to the end of that sequence.
+
+    Parameters: ~
+      • {s}                (`string`)
+      • {encoding}         (`"utf-8"|"utf-16"|"utf-32"`)
+      • {index}            (`integer?`)
+      • {strict_indexing}  (`boolean?`) default: true
+
+    Return: ~
+        (`integer`)
+
 vim.system({cmd}, {opts}, {on_exit})                            *vim.system()*
     Runs a system command or throws an error if {cmd} cannot be run.
 
-- 
cgit