From 230b0c7f021a57647a658edce27fe115343f083f Mon Sep 17 00:00:00 2001 From: Tristan Knight Date: Wed, 23 Oct 2024 14:33:57 +0100 Subject: feat(stdlib): overload vim.str_byteindex, vim.str_utfindex #30735 PROBLEM: There are several limitations to vim.str_byteindex, vim.str_utfindex: 1. They throw when given out-of-range indexes. An invalid (often user/lsp-provided) index doesn't feel exceptional and should be handled by the caller. `:help dev-error-patterns` suggests that `retval, errmsg` is the preferred way to handle this kind of failure. 2. They cannot accept an encoding. So LSP needs wrapper functions. #25272 3. The current signatures are not extensible. * Calling: The function currently uses a fairly opaque boolean value to identify the encoding. * Returns: The fact that it can throw requires wrapping in pcall. 4. The current name doesn't follow suggestions in `:h dev-naming` and I think `get` would be suitable. SOLUTION: - Because these are performance-sensitive, don't introduce `opts`. - Introduce an "overload" that accepts `encoding:string` and `strict_indexing:bool` params. ```lua local col = vim.str_utfindex(line, encoding, [index, [strict_indexing]]) ``` Support the old versions by dispatching on the type of argument 2, and deprecate that form. 
```lua vim.str_utfindex(line) -- (utf-32 length, utf-16 length), deprecated vim.str_utfindex(line, index) -- (utf-32 index, utf-16 index), deprecated vim.str_utfindex(line, 'utf-16') -- utf-16 length vim.str_utfindex(line, 'utf-16', index) -- utf-16 index vim.str_utfindex(line, 'utf-16', math.huge) -- error: index out of range vim.str_utfindex(line, 'utf-16', math.huge, false) -- utf-16 length ``` --- runtime/lua/vim/_editor.lua | 128 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 1 deletion(-) (limited to 'runtime/lua/vim/_editor.lua') diff --git a/runtime/lua/vim/_editor.lua b/runtime/lua/vim/_editor.lua index 58283ac64b..496bbf747c 100644 --- a/runtime/lua/vim/_editor.lua +++ b/runtime/lua/vim/_editor.lua @@ -68,6 +68,12 @@ vim.log = { }, } +local utfs = { + ['utf-8'] = true, + ['utf-16'] = true, + ['utf-32'] = true, +} + -- TODO(lewis6991): document that the signature is system({cmd}, [{opts},] {on_exit}) --- Runs a system command or throws an error if {cmd} cannot be run. --- @@ -714,7 +720,127 @@ function vim._on_key(buf, typed_buf) end end ---- Generates a list of possible completions for the string. +--- Convert UTF-32, UTF-16 or UTF-8 {index} to byte index. +--- If {strict_indexing} is false +--- then an out of range index will return byte length +--- instead of throwing an error. +--- +--- Invalid UTF-8 and NUL are treated like in |vim.str_utfindex()|. +--- An {index} in the middle of a UTF-16 sequence is rounded upwards to +--- the end of that sequence. +---@param s string +---@param encoding "utf-8"|"utf-16"|"utf-32" +---@param index integer +---@param strict_indexing? 
boolean # default: true +---@return integer # byte index +function vim.str_byteindex(s, encoding, index, strict_indexing) + if type(encoding) == 'number' then + -- Legacy support for old API: a numeric argument 2 means the call is (str, index, use_utf16) + -- Parameters: ~ + -- • {str} (`string`) + -- • {index} (`integer`) + -- • {use_utf16} (`boolean?`) + local old_index = encoding + local use_utf16 = index or false + return vim.__str_byteindex(s, old_index, use_utf16) or error('index out of range') + end + + vim.validate('s', s, 'string') + vim.validate('index', index, 'number') + + local len = #s + + if index == 0 or len == 0 then + return 0 + end + + vim.validate('encoding', encoding, function(v) + return utfs[v], 'invalid encoding' + end) + + vim.validate('strict_indexing', strict_indexing, 'boolean', true) + if strict_indexing == nil then + strict_indexing = true + end + + if encoding == 'utf-8' then + if index > len then + return strict_indexing and error('index out of range') or len + end + return index + end + return vim.__str_byteindex(s, index, encoding == 'utf-16') + or strict_indexing and error('index out of range') + or len +end + +--- Convert byte index to UTF-32, UTF-16 or UTF-8 indices. If {index} is not +--- supplied, the length of the string is used. All indices are zero-based. +--- +--- If {strict_indexing} is false then an out of range index will return string +--- length instead of throwing an error. +--- Invalid UTF-8 bytes and embedded surrogates are counted as one code point +--- each. An {index} in the middle of a UTF-8 sequence is rounded upwards to the end of +--- that sequence. +---@param s string +---@param encoding "utf-8"|"utf-16"|"utf-32" +---@param index? integer +---@param strict_indexing? 
boolean # default: true +---@return integer # index in the requested encoding +function vim.str_utfindex(s, encoding, index, strict_indexing) + if encoding == nil or type(encoding) == 'number' then + -- Legacy support for old API: a nil or numeric argument 2 means the call is (str, index?) + -- Parameters: ~ + -- • {str} (`string`) + -- • {index} (`integer?`) + local old_index = encoding + local col32, col16 = vim.__str_utfindex(s, old_index) --[[@as integer,integer]] + if not col32 or not col16 then + error('index out of range') + end + -- Return (multiple): ~ + -- (`integer`) UTF-32 index + -- (`integer`) UTF-16 index + return col32, col16 + end + + vim.validate('s', s, 'string') + vim.validate('index', index, 'number', true) + if not index then + index = math.huge + strict_indexing = false + end + + if index == 0 then + return 0 + end + + vim.validate('encoding', encoding, function(v) + return utfs[v], 'invalid encoding' + end) + + vim.validate('strict_indexing', strict_indexing, 'boolean', true) + if strict_indexing == nil then + strict_indexing = true + end + + if encoding == 'utf-8' then + local len = #s + return index <= len and index or (strict_indexing and error('index out of range') or len) + end + local col32, col16 = vim.__str_utfindex(s, index) --[[@as integer?,integer?]] + local col = encoding == 'utf-16' and col16 or col32 + if col then + return col + end + if strict_indexing then + error('index out of range') + end + local max32, max16 = vim.__str_utfindex(s)--[[@as integer,integer]] + return encoding == 'utf-16' and max16 or max32 +end + +--- Generates a list of possible completions for the string. --- String has the pattern. --- --- 1. Can we get it to just return things in the global namespace with that name prefix -- cgit