fix(lsp): rewrite incremental sync (#16252)

* use codeunits/points instead of byte ranges when applicable * take into account different file formats when computing range and sending text (dos, unix, and mac supported) * add tests of incremental sync
author: Michael Lingelbach <m.j.lbach@gmail.com> 2021-11-09 14:37:48 -0800
committer: GitHub <noreply@github.com> 2021-11-09 14:37:48 -0800
commit: 2ecf0a4c6183bba1c65d440711038f040d355fef (patch)
tree: 7774b0cebcf60039cf0232723a65840e784e7af2 /runtime/lua/vim/lsp/sync.lua
parent: 953ae71fd324eb1a263d2b7435cc15756b44ac2d (diff)
download: rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.gz
rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.bz2
rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.zip
1 files changed, 381 insertions, 0 deletions
diff --git a/runtime/lua/vim/lsp/sync.lua b/runtime/lua/vim/lsp/sync.lua
new file mode 100644
index 0000000000..37247c61b9
--- /dev/null
+++ b/runtime/lua/vim/lsp/sync.lua
@@ -0,0 +1,381 @@
+-- Notes on incremental sync:
+--  Per the protocol, the text range should be:
+--
+--  A position inside a document (see Position definition below) is expressed as
+--  a zero-based line and character offset. The offsets are based on a UTF-16
+--  string representation. So a string of the form a𐐀b the character offset
+--  of the character a is 0, the character offset of 𐐀 is 1 and the character
+--  offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
+--
+--  To ensure that both client and server split the string into the same line
+--  representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
+--
+--  Positions are line end character agnostic. So you can not specify a position that
+--  denotes \r|\n or \n| where | represents the character offset. This means *no* defining
+--  a range than ends on the same line after a terminating character
+--
+-- Generic warnings about byte level changes in neovim. Many apparently "single"
+-- operations in on_lines callbacks are actually multiple operations.
+--
+--  Join operation (2 operations):
+--  * extends line 1 with the contents of line 2
+--  * deletes line 2
+--
+--  test 1    test 1 test 2    test 1 test 2
+--  test 2 -> test 2        -> test 3
+--  test 3    test 3
+--
+--  Deleting (and undoing) two middle lines (1 operation):
+--
+--  test 1    test 1
+--  test 2 -> test 4
+--  test 3
+--  test 4
+--
+--  Deleting partial lines (5 operations) deleting between asterisks below:
+--
+--  test *1   test *    test *     test *    test *4    test *4*
+--  test 2 -> test 2 -> test *4 -> *4     -> *4      ->
+--  test 3    test 3
+--  test *4   test 4
+
+local M = {}
+
+-- local string.byte, unclear if this is necessary for JIT compilation
+local str_byte = string.byte
+local min = math.min
+local str_utfindex = vim.str_utfindex
+local str_utf_start = vim.str_utf_start
+local str_utf_end = vim.str_utf_end
+
+---@private
+-- Given a line, byte idx, and offset_encoding convert to the
+-- utf-8, utf-16, or utf-32 index.
+---@param line string the line to index into
+---@param byte integer the byte idx
+---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
+--@returns integer the utf idx for the given encoding
+local function byte_to_utf(line, byte, offset_encoding)
+  -- convert to 0 based indexing for str_utfindex
+  byte = byte - 1
+
+  local utf_idx
+  local _
+  -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
+  if offset_encoding == 'utf-16' then
+    _, utf_idx = str_utfindex(line, byte)
+  elseif offset_encoding == 'utf-32' then
+    utf_idx, _ = str_utfindex(line, byte)
+  else
+    utf_idx = byte
+  end
+
+  -- convert to 1 based indexing
+  return utf_idx + 1
+end
+
+---@private
+-- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
+-- utf-8 index and either the utf-16, or utf-32 index.
+---@param line string the line to index into
+---@param byte integer the byte idx
+---@param align string when dealing with multibyte characters,
+--        to choose the start of the current character or the beginning of the next.
+--        Used for incremental sync for start/end range respectively
+---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
+---@returns table<string, int> byte_idx and char_idx of first change position
+local function align_position(line, byte, align, offset_encoding)
+  local char
+  -- If on the first byte, or an empty string: the trivial case
+  if byte == 1 or #line == 0 then
+    char = byte
+  -- Called in the case of extending an empty line "" -> "a"
+  elseif byte == #line + 1 then
+    byte = byte + str_utf_end(line, #line)
+    char = byte_to_utf(line, byte, offset_encoding)
+  else
+    -- Modifying line, find the nearest utf codepoint
+    if align == 'start' then
+      byte = byte + str_utf_start(line, byte)
+      char = byte_to_utf(line, byte, offset_encoding)
+    elseif align == 'end' then
+      local offset = str_utf_end(line, byte)
+      -- If the byte does not fall on the start of the character, then
+      -- align to the start of the next character.
+      if offset > 0 then
+        char = byte_to_utf(line, byte, offset_encoding) + 1
+        byte = byte + offset
+      else
+        char = byte_to_utf(line, byte, offset_encoding)
+        byte = byte + offset
+      end
+    else
+      error('`align` must be start or end.')
+    end
+    -- Extending line, find the nearest utf codepoint for the last valid character
+  end
+  return byte, char
+end
+
+---@private
+--- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint.
+---@param prev_lines table list of lines from previous buffer
+---@param curr_lines table list of lines from current buffer
+---@param firstline integer firstline from on_lines, adjusted to 1-index
+---@param lastline integer lastline from on_lines, adjusted to 1-index
+---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
+---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
+---@returns table<int, int> line_idx, byte_idx, and char_idx of first change position
+local function compute_start_range(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding)
+  -- If firstline == lastline, no existing text is changed. All edit operations
+  -- occur on a new line pointed to by lastline. This occurs during insertion of
+  -- new lines(O), the new newline is inserted at the line indicated by
+  -- new_lastline.
+  -- If firstline == new_lastline, the first change occured on a line that was deleted.
+  -- In this case, the first byte change is also at the first byte of firstline
+  if firstline == new_lastline or firstline == lastline then
+    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
+  end
+
+  local prev_line = prev_lines[firstline]
+  local curr_line = curr_lines[firstline]
+
+  -- Iterate across previous and current line containing first change
+  -- to find the first different byte.
+  -- Note: *about -> a*about will register the second a as the first
+  -- difference, regardless of edit since we do not receive the first
+  -- column of the edit from on_lines.
+  local start_byte_idx = 1
+  for idx = 1, #prev_line + 1 do
+    start_byte_idx = idx
+    if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then
+      break
+    end
+  end
+
+  -- Convert byte to codepoint if applicable
+  local byte_idx, char_idx = align_position(prev_line, start_byte_idx, 'start', offset_encoding)
+
+  -- Return the start difference (shared for new and prev lines)
+  return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
+end
+
+---@private
+--- Finds the last line and byte index of the differences between prev and current buffer.
+--- Normalized to the next codepoint.
+--- prev_end_range is the text range sent to the server representing the changed region.
+--- curr_end_range is the text that should be collected and sent to the server.
+--
+---@param prev_lines table list of lines
+---@param curr_lines table list of lines
+---@param start_range table
+---@param lastline integer
+---@param new_lastline integer
+---@param offset_encoding string
+---@returns (int, int) end_line_idx and end_col_idx of range
+local function compute_end_range(prev_lines, curr_lines, start_range, firstline, lastline, new_lastline, offset_encoding)
+  -- If firstline == new_lastline, the first change occured on a line that was deleted.
+  -- In this case, the last_byte...
+  if firstline == new_lastline then
+      return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, { line_idx = firstline, byte_idx = 1, char_idx = 1 }
+  end
+  if firstline == lastline then
+      return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, { line_idx = new_lastline - lastline + firstline, byte_idx = 1, char_idx = 1 }
+  end
+  -- Compare on last line, at minimum will be the start range
+  local start_line_idx = start_range.line_idx
+
+  -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines
+  local prev_line_idx = lastline - 1
+  local curr_line_idx = new_lastline - 1
+
+  local prev_line = prev_lines[lastline - 1]
+  local curr_line = curr_lines[new_lastline - 1]
+
+  local prev_line_length = #prev_line
+  local curr_line_length = #curr_line
+
+  local byte_offset = 0
+
+  -- Editing the same line
+  -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
+  if prev_line_idx == curr_line_idx then
+    local max_length
+    if start_line_idx == prev_line_idx then
+      -- Search until beginning of difference
+      max_length = min(prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx) + 1
+    else
+      max_length = min(prev_line_length, curr_line_length) + 1
+    end
+    for idx = 0, max_length do
+      byte_offset = idx
+      if
+        str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset)
+      then
+        break
+      end
+    end
+  end
+
+  -- Iterate from end to beginning of shortest line
+  local prev_end_byte_idx = prev_line_length - byte_offset + 1
+  local prev_byte_idx, prev_char_idx = align_position(prev_line, prev_end_byte_idx, 'start', offset_encoding)
+  local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }
+
+  local curr_end_range
+  -- Deletion event, new_range cannot be before start
+  if curr_line_idx < start_line_idx then
+    curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
+  else
+    local curr_end_byte_idx = curr_line_length - byte_offset + 1
+    local curr_byte_idx, curr_char_idx = align_position(curr_line, curr_end_byte_idx, 'start', offset_encoding)
+    curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
+  end
+
+  return prev_end_range, curr_end_range
+end
+
+---@private
+--- Get the text of the range defined by start and end line/column
+---@param lines table list of lines
+---@param start_range table table returned by first_difference
+---@param end_range table new_end_range returned by last_difference
+---@returns string text extracted from defined region
+local function extract_text(lines, start_range, end_range, line_ending)
+    if not lines[start_range.line_idx] then
+      return ""
+    end
+  -- Trivial case: start and end range are the same line, directly grab changed text
+  if start_range.line_idx == end_range.line_idx then
+    -- string.sub is inclusive, end_range is not
+    return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1)
+
+  else
+    -- Handle deletion case
+    -- Collect the changed portion of the first changed line
+    local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) }
+
+    -- Collect the full line for intermediate lines
+    for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
+      table.insert(result, lines[idx])
+    end
+
+    if lines[end_range.line_idx] then
+      -- Collect the changed portion of the last changed line.
+      table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1))
+    else
+      table.insert(result, "")
+    end
+
+    -- Add line ending between all lines
+    return table.concat(result, line_ending)
+  end
+end
+
+local function compute_line_length(line, offset_encoding)
+  local length
+  local _
+  if offset_encoding == 'utf-16' then
+     _, length = str_utfindex(line)
+  elseif offset_encoding == 'utf-32' then
+    length, _ = str_utfindex(line)
+  else
+    length = #line
+  end
+  return length
+end
+---@private
+-- rangelength depends on the offset encoding
+-- bytes for utf-8 (clangd with extenion)
+-- codepoints for utf-16
+-- codeunits for utf-32
+-- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac)
+-- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior)
+local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
+  local line_ending_length = #line_ending
+  -- Single line case
+  if start_range.line_idx == end_range.line_idx then
+    return end_range.char_idx - start_range.char_idx
+  end
+
+  local start_line = lines[start_range.line_idx]
+  local range_length
+  if start_line and #start_line > 0 then
+    range_length = compute_line_length(start_line, offset_encoding) - start_range.char_idx + 1 + line_ending_length
+  else
+    -- Length of newline character
+    range_length = line_ending_length
+  end
+
+  -- The first and last range of the line idx may be partial lines
+  for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
+    -- Length full line plus newline character
+    if #lines[idx] > 0 then
+      range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending
+    else
+      range_length = range_length + line_ending_length
+    end
+  end
+
+  local end_line = lines[end_range.line_idx]
+  if end_line and #end_line > 0 then
+    range_length = range_length + end_range.char_idx - 1
+  end
+
+  return range_length
+end
+
+--- Returns the range table for the difference between prev and curr lines
+---@param prev_lines table list of lines
+---@param curr_lines table list of lines
+---@param firstline number line to begin search for first difference
+---@param lastline number line to begin search in old_lines for last difference
+---@param new_lastline number line to begin search in new_lines for last difference
+---@param offset_encoding string encoding requested by language server
+---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent
+function M.compute_diff(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding, line_ending)
+  -- Find the start of changes between the previous and current buffer. Common between both.
+  -- Sent to the server as the start of the changed range.
+  -- Used to grab the changed text from the latest buffer.
+  local start_range = compute_start_range(
+    prev_lines,
+    curr_lines,
+    firstline + 1,
+    lastline + 1,
+    new_lastline + 1,
+    offset_encoding
+  )
+  -- Find the last position changed in the previous and current buffer.
+  -- prev_end_range is sent to the server as as the end of the changed range.
+  -- curr_end_range is used to grab the changed text from the latest buffer.
+  local prev_end_range, curr_end_range = compute_end_range(
+    prev_lines,
+    curr_lines,
+    start_range,
+    firstline + 1,
+    lastline + 1,
+    new_lastline + 1,
+    offset_encoding
+  )
+
+  -- Grab the changed text of from start_range to curr_end_range in the current buffer.
+  -- The text range is "" if entire range is deleted.
+  local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)
+
+  -- Compute the range of the replaced text. Deprecated but still required for certain language servers
+  local range_length = compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)
+
+  -- convert to 0 based indexing
+  local result = {
+    range = {
+      ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 },
+      ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 },
+    },
+    text = text,
+    rangeLength = range_length,
+  }
+
+  return result
+end
+
+return M
author	Michael Lingelbach <m.j.lbach@gmail.com>	2021-11-09 14:37:48 -0800
committer	GitHub <noreply@github.com>	2021-11-09 14:37:48 -0800
commit	2ecf0a4c6183bba1c65d440711038f040d355fef (patch)
tree	7774b0cebcf60039cf0232723a65840e784e7af2 /runtime/lua/vim/lsp/sync.lua
parent	953ae71fd324eb1a263d2b7435cc15756b44ac2d (diff)
download	rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.gz rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.tar.bz2 rneovim-2ecf0a4c6183bba1c65d440711038f040d355fef.zip