diff options
author | Michael Lingelbach <m.j.lbach@gmail.com> | 2021-11-09 14:37:48 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-11-09 14:37:48 -0800 |
commit | 2ecf0a4c6183bba1c65d440711038f040d355fef (patch) | |
tree | 7774b0cebcf60039cf0232723a65840e784e7af2 /runtime/lua/vim/lsp/sync.lua | |
parent | 953ae71fd324eb1a263d2b7435cc15756b44ac2d (diff) | |
download | rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.gz rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.bz2 rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.zip |
fix(lsp): rewrite incremental sync (#16252)
* use codeunits/points instead of byte ranges when applicable
* take into account different file formats when computing range and
sending text (dos, unix, and mac supported)
* add tests of incremental sync
Diffstat (limited to 'runtime/lua/vim/lsp/sync.lua')
-rw-r--r-- | runtime/lua/vim/lsp/sync.lua | 381 |
1 files changed, 381 insertions, 0 deletions
diff --git a/runtime/lua/vim/lsp/sync.lua b/runtime/lua/vim/lsp/sync.lua new file mode 100644 index 0000000000..37247c61b9 --- /dev/null +++ b/runtime/lua/vim/lsp/sync.lua @@ -0,0 +1,381 @@ +-- Notes on incremental sync: +-- Per the protocol, the text range should be: +-- +-- A position inside a document (see Position definition below) is expressed as +-- a zero-based line and character offset. The offsets are based on a UTF-16 +-- string representation. So a string of the form a𐐀b the character offset +-- of the character a is 0, the character offset of 𐐀 is 1 and the character +-- offset of b is 3 since 𐐀 is represented using two code units in UTF-16. +-- +-- To ensure that both client and server split the string into the same line +-- representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’. +-- +-- Positions are line end character agnostic. So you can not specify a position that +-- denotes \r|\n or \n| where | represents the character offset. This means *no* defining +-- a range than ends on the same line after a terminating character +-- +-- Generic warnings about byte level changes in neovim. Many apparently "single" +-- operations in on_lines callbacks are actually multiple operations. +-- +-- Join operation (2 operations): +-- * extends line 1 with the contents of line 2 +-- * deletes line 2 +-- +-- test 1 test 1 test 2 test 1 test 2 +-- test 2 -> test 2 -> test 3 +-- test 3 test 3 +-- +-- Deleting (and undoing) two middle lines (1 operation): +-- +-- test 1 test 1 +-- test 2 -> test 4 +-- test 3 +-- test 4 +-- +-- Deleting partial lines (5 operations) deleting between asterisks below: +-- +-- test *1 test * test * test * test *4 test *4* +-- test 2 -> test 2 -> test *4 -> *4 -> *4 -> +-- test 3 test 3 +-- test *4 test 4 + +local M = {} + +-- local string.byte, unclear if this is necessary for JIT compilation +local str_byte = string.byte +local min = math.min +local str_utfindex = vim.str_utfindex +local str_utf_start = vim.str_utf_start +local str_utf_end = vim.str_utf_end + +---@private +-- Given a line, byte idx, and offset_encoding convert to the +-- utf-8, utf-16, or utf-32 index. +---@param line string the line to index into +---@param byte integer the byte idx +---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8) +--@returns integer the utf idx for the given encoding +local function byte_to_utf(line, byte, offset_encoding) + -- convert to 0 based indexing for str_utfindex + byte = byte - 1 + + local utf_idx + local _ + -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based + if offset_encoding == 'utf-16' then + _, utf_idx = str_utfindex(line, byte) + elseif offset_encoding == 'utf-32' then + utf_idx, _ = str_utfindex(line, byte) + else + utf_idx = byte + end + + -- convert to 1 based indexing + return utf_idx + 1 +end + +---@private +-- Given a line, byte idx, alignment, and offset_encoding convert to the aligned +-- utf-8 index and either the utf-16, or utf-32 index. +---@param line string the line to index into +---@param byte integer the byte idx +---@param align string when dealing with multibyte characters, +-- to choose the start of the current character or the beginning of the next. +-- Used for incremental sync for start/end range respectively +---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8) +---@returns table<string, int> byte_idx and char_idx of first change position +local function align_position(line, byte, align, offset_encoding) + local char + -- If on the first byte, or an empty string: the trivial case + if byte == 1 or #line == 0 then + char = byte + -- Called in the case of extending an empty line "" -> "a" + elseif byte == #line + 1 then + byte = byte + str_utf_end(line, #line) + char = byte_to_utf(line, byte, offset_encoding) + else + -- Modifying line, find the nearest utf codepoint + if align == 'start' then + byte = byte + str_utf_start(line, byte) + char = byte_to_utf(line, byte, offset_encoding) + elseif align == 'end' then + local offset = str_utf_end(line, byte) + -- If the byte does not fall on the start of the character, then + -- align to the start of the next character. + if offset > 0 then + char = byte_to_utf(line, byte, offset_encoding) + 1 + byte = byte + offset + else + char = byte_to_utf(line, byte, offset_encoding) + byte = byte + offset + end + else + error('`align` must be start or end.') + end + -- Extending line, find the nearest utf codepoint for the last valid character + end + return byte, char +end + +---@private +--- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint. +---@param prev_lines table list of lines from previous buffer +---@param curr_lines table list of lines from current buffer +---@param firstline integer firstline from on_lines, adjusted to 1-index +---@param lastline integer lastline from on_lines, adjusted to 1-index +---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index +---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8) +---@returns table<int, int> line_idx, byte_idx, and char_idx of first change position +local function compute_start_range(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding) + -- If firstline == lastline, no existing text is changed. All edit operations + -- occur on a new line pointed to by lastline. This occurs during insertion of + -- new lines(O), the new newline is inserted at the line indicated by + -- new_lastline. + -- If firstline == new_lastline, the first change occured on a line that was deleted. + -- In this case, the first byte change is also at the first byte of firstline + if firstline == new_lastline or firstline == lastline then + return { line_idx = firstline, byte_idx = 1, char_idx = 1 } + end + + local prev_line = prev_lines[firstline] + local curr_line = curr_lines[firstline] + + -- Iterate across previous and current line containing first change + -- to find the first different byte. + -- Note: *about -> a*about will register the second a as the first + -- difference, regardless of edit since we do not receive the first + -- column of the edit from on_lines. + local start_byte_idx = 1 + for idx = 1, #prev_line + 1 do + start_byte_idx = idx + if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then + break + end + end + + -- Convert byte to codepoint if applicable + local byte_idx, char_idx = align_position(prev_line, start_byte_idx, 'start', offset_encoding) + + -- Return the start difference (shared for new and prev lines) + return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx } +end + +---@private +--- Finds the last line and byte index of the differences between prev and current buffer. +--- Normalized to the next codepoint. +--- prev_end_range is the text range sent to the server representing the changed region. +--- curr_end_range is the text that should be collected and sent to the server. +-- +---@param prev_lines table list of lines +---@param curr_lines table list of lines +---@param start_range table +---@param lastline integer +---@param new_lastline integer +---@param offset_encoding string +---@returns (int, int) end_line_idx and end_col_idx of range +local function compute_end_range(prev_lines, curr_lines, start_range, firstline, lastline, new_lastline, offset_encoding) + -- If firstline == new_lastline, the first change occured on a line that was deleted. + -- In this case, the last_byte... + if firstline == new_lastline then + return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, { line_idx = firstline, byte_idx = 1, char_idx = 1 } + end + if firstline == lastline then + return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, { line_idx = new_lastline - lastline + firstline, byte_idx = 1, char_idx = 1 } + end + -- Compare on last line, at minimum will be the start range + local start_line_idx = start_range.line_idx + + -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines + local prev_line_idx = lastline - 1 + local curr_line_idx = new_lastline - 1 + + local prev_line = prev_lines[lastline - 1] + local curr_line = curr_lines[new_lastline - 1] + + local prev_line_length = #prev_line + local curr_line_length = #curr_line + + local byte_offset = 0 + + -- Editing the same line + -- If the byte offset is zero, that means there is a difference on the last byte (not newline) + if prev_line_idx == curr_line_idx then + local max_length + if start_line_idx == prev_line_idx then + -- Search until beginning of difference + max_length = min(prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx) + 1 + else + max_length = min(prev_line_length, curr_line_length) + 1 + end + for idx = 0, max_length do + byte_offset = idx + if + str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset) + then + break + end + end + end + + -- Iterate from end to beginning of shortest line + local prev_end_byte_idx = prev_line_length - byte_offset + 1 + local prev_byte_idx, prev_char_idx = align_position(prev_line, prev_end_byte_idx, 'start', offset_encoding) + local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx } + + local curr_end_range + -- Deletion event, new_range cannot be before start + if curr_line_idx < start_line_idx then + curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 } + else + local curr_end_byte_idx = curr_line_length - byte_offset + 1 + local curr_byte_idx, curr_char_idx = align_position(curr_line, curr_end_byte_idx, 'start', offset_encoding) + curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx } + end + + return prev_end_range, curr_end_range +end + +---@private +--- Get the text of the range defined by start and end line/column +---@param lines table list of lines +---@param start_range table table returned by first_difference +---@param end_range table new_end_range returned by last_difference +---@returns string text extracted from defined region +local function extract_text(lines, start_range, end_range, line_ending) + if not lines[start_range.line_idx] then + return "" + end + -- Trivial case: start and end range are the same line, directly grab changed text + if start_range.line_idx == end_range.line_idx then + -- string.sub is inclusive, end_range is not + return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1) + + else + -- Handle deletion case + -- Collect the changed portion of the first changed line + local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) } + + -- Collect the full line for intermediate lines + for idx = start_range.line_idx + 1, end_range.line_idx - 1 do + table.insert(result, lines[idx]) + end + + if lines[end_range.line_idx] then + -- Collect the changed portion of the last changed line. + table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1)) + else + table.insert(result, "") + end + + -- Add line ending between all lines + return table.concat(result, line_ending) + end +end + +local function compute_line_length(line, offset_encoding) + local length + local _ + if offset_encoding == 'utf-16' then + _, length = str_utfindex(line) + elseif offset_encoding == 'utf-32' then + length, _ = str_utfindex(line) + else + length = #line + end + return length +end +---@private +-- rangelength depends on the offset encoding +-- bytes for utf-8 (clangd with extenion) +-- codepoints for utf-16 +-- codeunits for utf-32 +-- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac) +-- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior) +local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending) + local line_ending_length = #line_ending + -- Single line case + if start_range.line_idx == end_range.line_idx then + return end_range.char_idx - start_range.char_idx + end + + local start_line = lines[start_range.line_idx] + local range_length + if start_line and #start_line > 0 then + range_length = compute_line_length(start_line, offset_encoding) - start_range.char_idx + 1 + line_ending_length + else + -- Length of newline character + range_length = line_ending_length + end + + -- The first and last range of the line idx may be partial lines + for idx = start_range.line_idx + 1, end_range.line_idx - 1 do + -- Length full line plus newline character + if #lines[idx] > 0 then + range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending + else + range_length = range_length + line_ending_length + end + end + + local end_line = lines[end_range.line_idx] + if end_line and #end_line > 0 then + range_length = range_length + end_range.char_idx - 1 + end + + return range_length +end + +--- Returns the range table for the difference between prev and curr lines +---@param prev_lines table list of lines +---@param curr_lines table list of lines +---@param firstline number line to begin search for first difference +---@param lastline number line to begin search in old_lines for last difference +---@param new_lastline number line to begin search in new_lines for last difference +---@param offset_encoding string encoding requested by language server +---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent +function M.compute_diff(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding, line_ending) + -- Find the start of changes between the previous and current buffer. Common between both. + -- Sent to the server as the start of the changed range. + -- Used to grab the changed text from the latest buffer. + local start_range = compute_start_range( + prev_lines, + curr_lines, + firstline + 1, + lastline + 1, + new_lastline + 1, + offset_encoding + ) + -- Find the last position changed in the previous and current buffer. + -- prev_end_range is sent to the server as as the end of the changed range. + -- curr_end_range is used to grab the changed text from the latest buffer. + local prev_end_range, curr_end_range = compute_end_range( + prev_lines, + curr_lines, + start_range, + firstline + 1, + lastline + 1, + new_lastline + 1, + offset_encoding + ) + + -- Grab the changed text of from start_range to curr_end_range in the current buffer. + -- The text range is "" if entire range is deleted. + local text = extract_text(curr_lines, start_range, curr_end_range, line_ending) + + -- Compute the range of the replaced text. Deprecated but still required for certain language servers + local range_length = compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending) + + -- convert to 0 based indexing + local result = { + range = { + ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 }, + ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 }, + }, + text = text, + rangeLength = range_length, + } + + return result +end + +return M |