-- Notes on incremental sync: -- Per the protocol, the text range should be: -- -- A position inside a document (see Position definition below) is expressed as -- a zero-based line and character offset. The offsets are based on a UTF-16 -- string representation. So a string of the form a𐐀b the character offset -- of the character a is 0, the character offset of 𐐀 is 1 and the character -- offset of b is 3 since 𐐀 is represented using two code units in UTF-16. -- -- To ensure that both client and server split the string into the same line -- representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’. -- -- Positions are line end character agnostic. So you can not specify a position that -- denotes \r|\n or \n| where | represents the character offset. This means *no* defining -- a range than ends on the same line after a terminating character -- -- Generic warnings about byte level changes in neovim. Many apparently "single" -- operations in on_lines callbacks are actually multiple operations. -- -- Join operation (2 operations): -- * extends line 1 with the contents of line 2 -- * deletes line 2 -- -- test 1 test 1 test 2 test 1 test 2 -- test 2 -> test 2 -> test 3 -- test 3 test 3 -- -- Deleting (and undoing) two middle lines (1 operation): -- -- test 1 test 1 -- test 2 -> test 4 -- test 3 -- test 4 -- -- Deleting partial lines (5 operations) deleting between asterisks below: -- -- test *1 test * test * test * test *4 test *4* -- test 2 -> test 2 -> test *4 -> *4 -> *4 -> -- test 3 test 3 -- test *4 test 4 local M = {} -- local string.byte, unclear if this is necessary for JIT compilation local str_byte = string.byte local min = math.min local str_utfindex = vim.str_utfindex local str_utf_start = vim.str_utf_start local str_utf_end = vim.str_utf_end ---@private -- Given a line, byte idx, and offset_encoding convert to the -- utf-8, utf-16, or utf-32 index. ---@param line string the line to index into ---@param byte integer the byte idx ---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8) --@returns integer the utf idx for the given encoding local function byte_to_utf(line, byte, offset_encoding) -- convert to 0 based indexing for str_utfindex byte = byte - 1 local utf_idx local _ -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based if offset_encoding == 'utf-16' then _, utf_idx = str_utfindex(line, byte) elseif offset_encoding == 'utf-32' then utf_idx, _ = str_utfindex(line, byte) else utf_idx = byte end -- convert to 1 based indexing return utf_idx + 1 end ---@private local function compute_line_length(line, offset_encoding) local length local _ if offset_encoding == 'utf-16' then _, length = str_utfindex(line) elseif offset_encoding == 'utf-32' then length, _ = str_utfindex(line) else length = #line end return length end ---@private -- Given a line, byte idx, alignment, and offset_encoding convert to the aligned -- utf-8 index and either the utf-16, or utf-32 index. ---@param line string the line to index into ---@param byte integer the byte idx ---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8) ---@returns table byte_idx and char_idx of first change position local function align_end_position(line, byte, offset_encoding) local char -- If on the first byte, or an empty string: the trivial case if byte == 1 or #line == 0 then char = byte -- Called in the case of extending an empty line "" -> "a" elseif byte == #line + 1 then char = compute_line_length(line, offset_encoding) + 1 else -- Modifying line, find the nearest utf codepoint local offset = str_utf_start(line, byte) -- If the byte does not fall on the start of the character, then -- align to the start of the next character. if offset < 0 then byte = byte + str_utf_end(line, byte) + 1 end if byte <= #line then char = byte_to_utf(line, byte, offset_encoding) else char = compute_line_length(line, offset_encoding) + 1 end -- Extending line, find the nearest utf codepoint for the last valid character end return byte, char end ---@private --- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint. ---@param prev_lines table list of lines from previous buffer ---@param curr_lines table list of lines from current buffer ---@param firstline integer firstline from on_lines, adjusted to 1-index ---@param lastline integer lastline from on_lines, adjusted to 1-index ---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index ---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8) ---@returns table line_idx, byte_idx, and char_idx of first change position local function compute_start_range( prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding ) local char_idx local byte_idx -- If firstline == lastline, no existing text is changed. All edit operations -- occur on a new line pointed to by lastline. This occurs during insertion of -- new lines(O), the new newline is inserted at the line indicated by -- new_lastline. if firstline == lastline then local line_idx local line = prev_lines[firstline - 1] if line then line_idx = firstline - 1 byte_idx = #line + 1 char_idx = compute_line_length(line, offset_encoding) + 1 else line_idx = firstline byte_idx = 1 char_idx = 1 end return { line_idx = line_idx, byte_idx = byte_idx, char_idx = char_idx } end -- If firstline == new_lastline, the first change occurred on a line that was deleted. -- In this case, the first byte change is also at the first byte of firstline if firstline == new_lastline then return { line_idx = firstline, byte_idx = 1, char_idx = 1 } end local prev_line = prev_lines[firstline] local curr_line = curr_lines[firstline] -- Iterate across previous and current line containing first change -- to find the first different byte. -- Note: *about -> a*about will register the second a as the first -- difference, regardless of edit since we do not receive the first -- column of the edit from on_lines. local start_byte_idx = 1 for idx = 1, #prev_line + 1 do start_byte_idx = idx if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then break end end -- Convert byte to codepoint if applicable if start_byte_idx == 1 or (#prev_line == 0 and start_byte_idx == 1) then byte_idx = start_byte_idx char_idx = 1 elseif start_byte_idx == #prev_line + 1 then byte_idx = start_byte_idx char_idx = compute_line_length(prev_line, offset_encoding) + 1 else byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx) char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding) end -- Return the start difference (shared for new and prev lines) return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx } end ---@private --- Finds the last line and byte index of the differences between prev and current buffer. --- Normalized to the next codepoint. --- prev_end_range is the text range sent to the server representing the changed region. --- curr_end_range is the text that should be collected and sent to the server. -- ---@param prev_lines table list of lines ---@param curr_lines table list of lines ---@param start_range table ---@param lastline integer ---@param new_lastline integer ---@param offset_encoding string ---@returns (int, int) end_line_idx and end_col_idx of range local function compute_end_range( prev_lines, curr_lines, start_range, firstline, lastline, new_lastline, offset_encoding ) -- If firstline == new_lastline, the first change occurred on a line that was deleted. -- In this case, the last_byte... if firstline == new_lastline then return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, { line_idx = firstline, byte_idx = 1, char_idx = 1, } end if firstline == lastline then return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, { line_idx = new_lastline - lastline + firstline, byte_idx = 1, char_idx = 1, } end -- Compare on last line, at minimum will be the start range local start_line_idx = start_range.line_idx -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines local prev_line_idx = lastline - 1 local curr_line_idx = new_lastline - 1 local prev_line = prev_lines[lastline - 1] local curr_line = curr_lines[new_lastline - 1] local prev_line_length = #prev_line local curr_line_length = #curr_line local byte_offset = 0 -- Editing the same line -- If the byte offset is zero, that means there is a difference on the last byte (not newline) if prev_line_idx == curr_line_idx then local max_length if start_line_idx == prev_line_idx then -- Search until beginning of difference max_length = min( prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx ) + 1 else max_length = min(prev_line_length, curr_line_length) + 1 end for idx = 0, max_length do byte_offset = idx if str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset) then break end end end -- Iterate from end to beginning of shortest line local prev_end_byte_idx = prev_line_length - byte_offset + 1 -- Handle case where lines match if prev_end_byte_idx == 0 then prev_end_byte_idx = 1 end local prev_byte_idx, prev_char_idx = align_end_position(prev_line, prev_end_byte_idx, offset_encoding) local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx } local curr_end_range -- Deletion event, new_range cannot be before start if curr_line_idx < start_line_idx then curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 } else local curr_end_byte_idx = curr_line_length - byte_offset + 1 -- Handle case where lines match if curr_end_byte_idx == 0 then curr_end_byte_idx = 1 end local curr_byte_idx, curr_char_idx = align_end_position(curr_line, curr_end_byte_idx, offset_encoding) curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx } end return prev_end_range, curr_end_range end ---@private --- Get the text of the range defined by start and end line/column ---@param lines table list of lines ---@param start_range table table returned by first_difference ---@param end_range table new_end_range returned by last_difference ---@returns string text extracted from defined region local function extract_text(lines, start_range, end_range, line_ending) if not lines[start_range.line_idx] then return '' end -- Trivial case: start and end range are the same line, directly grab changed text if start_range.line_idx == end_range.line_idx then -- string.sub is inclusive, end_range is not return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1) else -- Handle deletion case -- Collect the changed portion of the first changed line local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) } -- Collect the full line for intermediate lines for idx = start_range.line_idx + 1, end_range.line_idx - 1 do table.insert(result, lines[idx]) end if lines[end_range.line_idx] then -- Collect the changed portion of the last changed line. table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1)) else table.insert(result, '') end -- Add line ending between all lines return table.concat(result, line_ending) end end ---@private -- rangelength depends on the offset encoding -- bytes for utf-8 (clangd with extension) -- codepoints for utf-16 -- codeunits for utf-32 -- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac) -- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior) local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending) local line_ending_length = #line_ending -- Single line case if start_range.line_idx == end_range.line_idx then return end_range.char_idx - start_range.char_idx end local start_line = lines[start_range.line_idx] local range_length if start_line and #start_line > 0 then range_length = compute_line_length(start_line, offset_encoding) - start_range.char_idx + 1 + line_ending_length else -- Length of newline character range_length = line_ending_length end -- The first and last range of the line idx may be partial lines for idx = start_range.line_idx + 1, end_range.line_idx - 1 do -- Length full line plus newline character if #lines[idx] > 0 then range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending else range_length = range_length + line_ending_length end end local end_line = lines[end_range.line_idx] if end_line and #end_line > 0 then range_length = range_length + end_range.char_idx - 1 end return range_length end --- Returns the range table for the difference between prev and curr lines ---@param prev_lines table list of lines ---@param curr_lines table list of lines ---@param firstline number line to begin search for first difference ---@param lastline number line to begin search in old_lines for last difference ---@param new_lastline number line to begin search in new_lines for last difference ---@param offset_encoding string encoding requested by language server ---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent function M.compute_diff( prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding, line_ending ) -- Find the start of changes between the previous and current buffer. Common between both. -- Sent to the server as the start of the changed range. -- Used to grab the changed text from the latest buffer. local start_range = compute_start_range( prev_lines, curr_lines, firstline + 1, lastline + 1, new_lastline + 1, offset_encoding ) -- Find the last position changed in the previous and current buffer. -- prev_end_range is sent to the server as as the end of the changed range. -- curr_end_range is used to grab the changed text from the latest buffer. local prev_end_range, curr_end_range = compute_end_range( prev_lines, curr_lines, start_range, firstline + 1, lastline + 1, new_lastline + 1, offset_encoding ) -- Grab the changed text of from start_range to curr_end_range in the current buffer. -- The text range is "" if entire range is deleted. local text = extract_text(curr_lines, start_range, curr_end_range, line_ending) -- Compute the range of the replaced text. Deprecated but still required for certain language servers local range_length = compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending) -- convert to 0 based indexing local result = { range = { ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 }, ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 }, }, text = text, rangeLength = range_length, } return result end return M