feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735

PROBLEM: There are several limitations to vim.str_byteindex, vim.str_utfindex: 1. They throw when given out-of-range indexes. An invalid (often user/lsp-provided) index doesn't feel exceptional and should be handled by the caller. `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred way to handle this kind of failure. 2. They cannot accept an encoding. So LSP needs wrapper functions. #25272 3. The current signatures are not extensible. * Calling: The function currently uses a fairly opaque boolean value to identify the encoding. * Returns: The fact it can throw requires wrapping in pcall. 4. The current name doesn't follow suggestions in `:h dev-naming` and I think `get` would be suitable. SOLUTION: - Because these are performance-sensitive, don't introduce `opts`. - Introduce an "overload" that accepts `encoding:string` and `strict_indexing:bool` params. ```lua local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]]) ``` Support the old versions by dispatching on the type of argument 2, and deprecate that form. ```lua vim.str_utfindex(line) -- (utf-32 length, utf-16 length), deprecated vim.str_utfindex(line, index) -- (utf-32 index, utf-16 index), deprecated vim.str_utfindex(line, 'utf-16') -- utf-16 length vim.str_utfindex(line, 'utf-16', index) -- utf-16 index vim.str_utfindex(line, 'utf-16', math.huge) -- error: index out of range vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length ```
author: Tristan Knight <admin@snappeh.com> 2024-10-23 14:33:57 +0100
committer: GitHub <noreply@github.com> 2024-10-23 06:33:57 -0700
commit: 230b0c7f021a57647a658edce27fe115343f083f (patch)
tree: 49bcf13151da2bc140408ce2fb173b782614a0ca /runtime/lua/vim
parent: 3a86b60032bd659c2b12e984abb40cee93568558 (diff)
download: rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.gz
rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.bz2
rneovim-230b0c7f021a57647a658edce27fe115343f083f.zip
2 files changed, 127 insertions, 26 deletions
diff --git a/runtime/lua/vim/_editor.lua b/runtime/lua/vim/_editor.lua
index 58283ac64b..496bbf747c 100644
--- a/runtime/lua/vim/_editor.lua
+++ b/runtime/lua/vim/_editor.lua
@@ -68,6 +68,12 @@ vim.log = {
   },
 }
 
+local utfs = { -- set of encoding names accepted by vim.str_byteindex() and vim.str_utfindex()
+  ['utf-8'] = true,
+  ['utf-16'] = true,
+  ['utf-32'] = true,
+}
+
 -- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit})
 --- Runs a system command or throws an error if {cmd} cannot be run.
 ---
@@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf)
   end
 end
 
---- Generates a list of possible completions for the string.
+--- Convert UTF-32, UTF-16 or UTF-8 {index} to byte index.
+--- If {strict_indexing} is false
+--- then an out of range index will return the byte length
+--- instead of throwing an error.
+---
+--- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
+--- An {index} in the middle of a UTF-16 sequence is rounded upwards to
+--- the end of that sequence.
+---@param s string
+---@param encoding "utf-8"|"utf-16"|"utf-32"
+---@param index integer
+---@param strict_indexing? boolean # default: true
+---@return integer
+function vim.str_byteindex(s, encoding, index, strict_indexing)
+  if type(encoding) == 'number' then
+    -- Legacy support for old API: argument 2 is a number, i.e. the old {index}.
+    -- Parameters: ~
+    --   • {str}        (`string`)
+    --   • {index}      (`integer`)
+    --   • {use_utf16}  (`boolean?`)
+    local old_index = encoding
+    local use_utf16 = index or false
+    return vim.__str_byteindex(s, old_index, use_utf16) or error('index out of range')
+  end
+
+  vim.validate('s', s, 'string')
+  vim.validate('index', index, 'number')
+
+  local len = #s
+
+  if index == 0 or len == 0 then -- trivial case; note: returns before {encoding} is validated
+    return 0
+  end
+
+  vim.validate('encoding', encoding, function(v)
+    return utfs[v], 'invalid encoding'
+  end)
+
+  vim.validate('strict_indexing', strict_indexing, 'boolean', true)
+  if strict_indexing == nil then
+    strict_indexing = true
+  end
+
+  if encoding == 'utf-8' then -- a UTF-8 index is returned as-is (or clamped to the byte length)
+    if index > len then
+      return strict_indexing and error('index out of range') or len
+    end
+    return index
+  end
+  return vim.__str_byteindex(s, index, encoding == 'utf-16')
+    or strict_indexing and error('index out of range') -- a falsy result is treated as out of range
+    or len -- non-strict: fall back to the byte length
+end
+
+--- Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not
+--- supplied, the length of the string is used. All indices are zero-based.
+---
+--- If {strict_indexing} is false then an out of range index will return string
+--- length instead of throwing an error.
+--- Invalid UTF-8 bytes and embedded surrogates are counted as one code point
+--- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
+--- that sequence.
+---@param s string
+---@param encoding "utf-8"|"utf-16"|"utf-32"
+---@param index? integer
+---@param strict_indexing? boolean # default: true
+---@return integer
+function vim.str_utfindex(s, encoding, index, strict_indexing)
+  if encoding == nil or type(encoding) == 'number' then
+    -- Legacy support for old API: argument 2 is absent or a number, i.e. the old {index}.
+    -- Parameters: ~
+    --   • {str}    (`string`)
+    --   • {index}  (`integer?`)
+    local old_index = encoding
+    local col32, col16 = vim.__str_utfindex(s, old_index) --[[@as integer,integer]]
+    if not col32 or not col16 then
+      error('index out of range')
+    end
+    -- Return (multiple): ~
+    --     (`integer`) UTF-32 index
+    --     (`integer`) UTF-16 index
+    return col32, col16
+  end
+
+  vim.validate('s', s, 'string')
+  vim.validate('index', index, 'number', true)
+  if not index then -- no {index}: handle it as a non-strict, out-of-range index
+    index = math.huge
+    strict_indexing = false
+  end
+
+  if index == 0 then -- trivial case; note: returns before {encoding} is validated
+    return 0
+  end
+
+  vim.validate('encoding', encoding, function(v)
+    return utfs[v], 'invalid encoding'
+  end)
+
+  vim.validate('strict_indexing', strict_indexing, 'boolean', true)
+  if strict_indexing == nil then
+    strict_indexing = true
+  end
+
+  if encoding == 'utf-8' then -- a byte index is returned as-is (or clamped to the byte length)
+    local len = #s
+    return index <= len and index or (strict_indexing and error('index out of range') or len)
+  end
+  local col32, col16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]]
+  local col = encoding == 'utf-16' and col16 or col32
+  if col then
+    return col
+  end
+  if strict_indexing then -- a falsy result is treated as out of range
+    error('index out of range')
+  end
+  local max32, max16 = vim.__str_utfindex(s)--[[@as integer,integer]]
+  return encoding == 'utf-16' and max16 or max32
+end
+
+--- Generates a list of possible completions for the str
 --- String has the pattern.
 ---
 --- 1. Can we get it to just return things in the global namespace with that name prefix
diff --git a/runtime/lua/vim/_meta/builtin.lua b/runtime/lua/vim/_meta/builtin.lua
index 13bd1c1294..234c75d38f 100644
--- a/runtime/lua/vim/_meta/builtin.lua
+++ b/runtime/lua/vim/_meta/builtin.lua
@@ -112,18 +112,6 @@ function vim.rpcrequest(channel, method, ...) end
 --- equal, {a} is greater than {b} or {a} is lesser than {b}, respectively.
 function vim.stricmp(a, b) end
 
---- Convert UTF-32 or UTF-16 {index} to byte index. If {use_utf16} is not
---- supplied, it defaults to false (use UTF-32). Returns the byte index.
----
---- Invalid UTF-8 and NUL is treated like in |vim.str_utfindex()|.
---- An {index} in the middle of a UTF-16 sequence is rounded upwards to
---- the end of that sequence.
---- @param str string
---- @param index integer
---- @param use_utf16? boolean
---- @return integer
-function vim.str_byteindex(str, index, use_utf16) end
-
 --- Gets a list of the starting byte positions of each UTF-8 codepoint in the given string.
 ---
 --- Embedded NUL bytes are treated as terminating the string.
@@ -173,19 +161,6 @@ function vim.str_utf_start(str, index) end
 --- @return integer
 function vim.str_utf_end(str, index) end
 
---- Convert byte index to UTF-32 and UTF-16 indices. If {index} is not
---- supplied, the length of the string is used. All indices are zero-based.
----
---- Embedded NUL bytes are treated as terminating the string. Invalid UTF-8
---- bytes, and embedded surrogates are counted as one code point each. An
---- {index} in the middle of a UTF-8 sequence is rounded upwards to the end of
---- that sequence.
---- @param str string
---- @param index? integer
---- @return integer # UTF-32 index
---- @return integer # UTF-16 index
-function vim.str_utfindex(str, index) end
-
 --- The result is a String, which is the text {str} converted from
 --- encoding {from} to encoding {to}. When the conversion fails `nil` is
 --- returned.  When some characters could not be converted they
author	Tristan Knight <admin@snappeh.com>	2024-10-23 14:33:57 +0100
committer	GitHub <noreply@github.com>	2024-10-23 06:33:57 -0700
commit	230b0c7f021a57647a658edce27fe115343f083f (patch)
tree	49bcf13151da2bc140408ce2fb173b782614a0ca /runtime/lua/vim
parent	3a86b60032bd659c2b12e984abb40cee93568558 (diff)
download	rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.gz rneovim-230b0c7f021a57647a658edce27fe115343f083f.tar.bz2 rneovim-230b0c7f021a57647a658edce27fe115343f083f.zip