| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
 | -- Notes on incremental sync:
--  Per the protocol, the text range should be:
--
--  A position inside a document (see Position definition below) is expressed as
--  a zero-based line and character offset. The offsets are based on a UTF-16
--  string representation. So a string of the form a𐐀b the character offset
--  of the character a is 0, the character offset of 𐐀 is 1 and the character
--  offset of b is 3 since 𐐀 is represented using two code units in UTF-16.
--
--  To ensure that both client and server split the string into the same line
--  representation the protocol specifies the following end-of-line sequences: ‘\n’, ‘\r\n’ and ‘\r’.
--
--  Positions are line end character agnostic. So you can not specify a position that
--  denotes \r|\n or \n| where | represents the character offset. This means *no* defining
--  a range than ends on the same line after a terminating character
--
-- Generic warnings about byte level changes in neovim. Many apparently "single"
-- operations in on_lines callbacks are actually multiple operations.
--
--  Join operation (2 operations):
--  * extends line 1 with the contents of line 2
--  * deletes line 2
--
--  test 1    test 1 test 2    test 1 test 2
--  test 2 -> test 2        -> test 3
--  test 3    test 3
--
--  Deleting (and undoing) two middle lines (1 operation):
--
--  test 1    test 1
--  test 2 -> test 4
--  test 3
--  test 4
--
--  Deleting partial lines (5 operations) deleting between asterisks below:
--
--  test *1   test *    test *     test *    test *4    test *4*
--  test 2 -> test 2 -> test *4 -> *4     -> *4      ->
--  test 3    test 3
--  test *4   test 4
local M = {}
-- local string.byte, unclear if this is necessary for JIT compilation
local str_byte = string.byte
local min = math.min
local str_utfindex = vim.str_utfindex
local str_utf_start = vim.str_utf_start
local str_utf_end = vim.str_utf_end
---@private
-- Given a line, byte idx, and offset_encoding convert to the
-- utf-8, utf-16, or utf-32 index.
---@param line string the line to index into
---@param byte integer the byte idx
---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
--@returns integer the utf idx for the given encoding
local function byte_to_utf(line, byte, offset_encoding)
  -- convert to 0 based indexing for str_utfindex
  byte = byte - 1
  local utf_idx
  local _
  -- Convert the byte range to utf-{8,16,32} and convert 1-based (lua) indexing to 0-based
  if offset_encoding == 'utf-16' then
    _, utf_idx = str_utfindex(line, byte)
  elseif offset_encoding == 'utf-32' then
    utf_idx, _ = str_utfindex(line, byte)
  else
    utf_idx = byte
  end
  -- convert to 1 based indexing
  return utf_idx + 1
end
---@private
local function compute_line_length(line, offset_encoding)
  local length
  local _
  if offset_encoding == 'utf-16' then
    _, length = str_utfindex(line)
  elseif offset_encoding == 'utf-32' then
    length, _ = str_utfindex(line)
  else
    length = #line
  end
  return length
end
---@private
-- Given a line, byte idx, alignment, and offset_encoding convert to the aligned
-- utf-8 index and either the utf-16, or utf-32 index.
---@param line string the line to index into
---@param byte integer the byte idx
---@param offset_encoding string utf-8|utf-16|utf-32|nil (default: utf-8)
---@returns table<string, int> byte_idx and char_idx of first change position
local function align_end_position(line, byte, offset_encoding)
  local char
  -- If on the first byte, or an empty string: the trivial case
  if byte == 1 or #line == 0 then
    char = byte
    -- Called in the case of extending an empty line "" -> "a"
  elseif byte == #line + 1 then
    char = compute_line_length(line, offset_encoding) + 1
  else
    -- Modifying line, find the nearest utf codepoint
    local offset = str_utf_start(line, byte)
    -- If the byte does not fall on the start of the character, then
    -- align to the start of the next character.
    if offset < 0 then
      byte = byte + str_utf_end(line, byte) + 1
    end
    if byte <= #line then
      char = byte_to_utf(line, byte, offset_encoding)
    else
      char = compute_line_length(line, offset_encoding) + 1
    end
    -- Extending line, find the nearest utf codepoint for the last valid character
  end
  return byte, char
end
---@private
--- Finds the first line, byte, and char index of the difference between the previous and current lines buffer normalized to the previous codepoint.
---@param prev_lines table list of lines from previous buffer
---@param curr_lines table list of lines from current buffer
---@param firstline integer firstline from on_lines, adjusted to 1-index
---@param lastline integer lastline from on_lines, adjusted to 1-index
---@param new_lastline integer new_lastline from on_lines, adjusted to 1-index
---@param offset_encoding string utf-8|utf-16|utf-32|nil (fallback to utf-8)
---@returns table<int, int> line_idx, byte_idx, and char_idx of first change position
local function compute_start_range(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding)
  local char_idx
  local byte_idx
  -- If firstline == lastline, no existing text is changed. All edit operations
  -- occur on a new line pointed to by lastline. This occurs during insertion of
  -- new lines(O), the new newline is inserted at the line indicated by
  -- new_lastline.
  if firstline == lastline then
    local line_idx
    local line = prev_lines[firstline - 1]
    if line then
      line_idx = firstline - 1
      byte_idx = #line + 1
      char_idx = compute_line_length(line, offset_encoding) + 1
    else
      line_idx = firstline
      byte_idx = 1
      char_idx = 1
    end
    return { line_idx = line_idx, byte_idx = byte_idx, char_idx = char_idx }
  end
  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- In this case, the first byte change is also at the first byte of firstline
  if firstline == new_lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }
  end
  local prev_line = prev_lines[firstline]
  local curr_line = curr_lines[firstline]
  -- Iterate across previous and current line containing first change
  -- to find the first different byte.
  -- Note: *about -> a*about will register the second a as the first
  -- difference, regardless of edit since we do not receive the first
  -- column of the edit from on_lines.
  local start_byte_idx = 1
  for idx = 1, #prev_line + 1 do
    start_byte_idx = idx
    if str_byte(prev_line, idx) ~= str_byte(curr_line, idx) then
      break
    end
  end
  -- Convert byte to codepoint if applicable
  if start_byte_idx == 1 or (#prev_line == 0 and start_byte_idx == 1) then
    byte_idx = start_byte_idx
    char_idx = 1
  elseif start_byte_idx == #prev_line + 1 then
    byte_idx = start_byte_idx
    char_idx = compute_line_length(prev_line, offset_encoding) + 1
  else
    byte_idx = start_byte_idx + str_utf_start(prev_line, start_byte_idx)
    char_idx = byte_to_utf(prev_line, byte_idx, offset_encoding)
  end
  -- Return the start difference (shared for new and prev lines)
  return { line_idx = firstline, byte_idx = byte_idx, char_idx = char_idx }
end
---@private
--- Finds the last line and byte index of the differences between prev and current buffer.
--- Normalized to the next codepoint.
--- prev_end_range is the text range sent to the server representing the changed region.
--- curr_end_range is the text that should be collected and sent to the server.
--
---@param prev_lines table list of lines
---@param curr_lines table list of lines
---@param start_range table
---@param lastline integer
---@param new_lastline integer
---@param offset_encoding string
---@returns (int, int) end_line_idx and end_col_idx of range
local function compute_end_range(
  prev_lines,
  curr_lines,
  start_range,
  firstline,
  lastline,
  new_lastline,
  offset_encoding
)
  -- If firstline == new_lastline, the first change occurred on a line that was deleted.
  -- In this case, the last_byte...
  if firstline == new_lastline then
    return { line_idx = (lastline - new_lastline + firstline), byte_idx = 1, char_idx = 1 }, {
      line_idx = firstline,
      byte_idx = 1,
      char_idx = 1,
    }
  end
  if firstline == lastline then
    return { line_idx = firstline, byte_idx = 1, char_idx = 1 }, {
      line_idx = new_lastline - lastline + firstline,
      byte_idx = 1,
      char_idx = 1,
    }
  end
  -- Compare on last line, at minimum will be the start range
  local start_line_idx = start_range.line_idx
  -- lastline and new_lastline were last lines that were *not* replaced, compare previous lines
  local prev_line_idx = lastline - 1
  local curr_line_idx = new_lastline - 1
  local prev_line = prev_lines[lastline - 1]
  local curr_line = curr_lines[new_lastline - 1]
  local prev_line_length = #prev_line
  local curr_line_length = #curr_line
  local byte_offset = 0
  -- Editing the same line
  -- If the byte offset is zero, that means there is a difference on the last byte (not newline)
  if prev_line_idx == curr_line_idx then
    local max_length
    if start_line_idx == prev_line_idx then
      -- Search until beginning of difference
      max_length = min(prev_line_length - start_range.byte_idx, curr_line_length - start_range.byte_idx) + 1
    else
      max_length = min(prev_line_length, curr_line_length) + 1
    end
    for idx = 0, max_length do
      byte_offset = idx
      if str_byte(prev_line, prev_line_length - byte_offset) ~= str_byte(curr_line, curr_line_length - byte_offset) then
        break
      end
    end
  end
  -- Iterate from end to beginning of shortest line
  local prev_end_byte_idx = prev_line_length - byte_offset + 1
  -- Handle case where lines match
  if prev_end_byte_idx == 0 then
    prev_end_byte_idx = 1
  end
  local prev_byte_idx, prev_char_idx = align_end_position(prev_line, prev_end_byte_idx, offset_encoding)
  local prev_end_range = { line_idx = prev_line_idx, byte_idx = prev_byte_idx, char_idx = prev_char_idx }
  local curr_end_range
  -- Deletion event, new_range cannot be before start
  if curr_line_idx < start_line_idx then
    curr_end_range = { line_idx = start_line_idx, byte_idx = 1, char_idx = 1 }
  else
    local curr_end_byte_idx = curr_line_length - byte_offset + 1
    -- Handle case where lines match
    if curr_end_byte_idx == 0 then
      curr_end_byte_idx = 1
    end
    local curr_byte_idx, curr_char_idx = align_end_position(curr_line, curr_end_byte_idx, offset_encoding)
    curr_end_range = { line_idx = curr_line_idx, byte_idx = curr_byte_idx, char_idx = curr_char_idx }
  end
  return prev_end_range, curr_end_range
end
---@private
--- Get the text of the range defined by start and end line/column
---@param lines table list of lines
---@param start_range table table returned by first_difference
---@param end_range table new_end_range returned by last_difference
---@returns string text extracted from defined region
local function extract_text(lines, start_range, end_range, line_ending)
  if not lines[start_range.line_idx] then
    return ''
  end
  -- Trivial case: start and end range are the same line, directly grab changed text
  if start_range.line_idx == end_range.line_idx then
    -- string.sub is inclusive, end_range is not
    return string.sub(lines[start_range.line_idx], start_range.byte_idx, end_range.byte_idx - 1)
  else
    -- Handle deletion case
    -- Collect the changed portion of the first changed line
    local result = { string.sub(lines[start_range.line_idx], start_range.byte_idx) }
    -- Collect the full line for intermediate lines
    for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
      table.insert(result, lines[idx])
    end
    if lines[end_range.line_idx] then
      -- Collect the changed portion of the last changed line.
      table.insert(result, string.sub(lines[end_range.line_idx], 1, end_range.byte_idx - 1))
    else
      table.insert(result, '')
    end
    -- Add line ending between all lines
    return table.concat(result, line_ending)
  end
end
---@private
-- rangelength depends on the offset encoding
-- bytes for utf-8 (clangd with extension)
-- codepoints for utf-16
-- codeunits for utf-32
-- Line endings count here as 2 chars for \r\n (dos), 1 char for \n (unix), and 1 char for \r (mac)
-- These correspond to Windows, Linux/macOS (OSX and newer), and macOS (version 9 and prior)
local function compute_range_length(lines, start_range, end_range, offset_encoding, line_ending)
  local line_ending_length = #line_ending
  -- Single line case
  if start_range.line_idx == end_range.line_idx then
    return end_range.char_idx - start_range.char_idx
  end
  local start_line = lines[start_range.line_idx]
  local range_length
  if start_line and #start_line > 0 then
    range_length = compute_line_length(start_line, offset_encoding) - start_range.char_idx + 1 + line_ending_length
  else
    -- Length of newline character
    range_length = line_ending_length
  end
  -- The first and last range of the line idx may be partial lines
  for idx = start_range.line_idx + 1, end_range.line_idx - 1 do
    -- Length full line plus newline character
    if #lines[idx] > 0 then
      range_length = range_length + compute_line_length(lines[idx], offset_encoding) + #line_ending
    else
      range_length = range_length + line_ending_length
    end
  end
  local end_line = lines[end_range.line_idx]
  if end_line and #end_line > 0 then
    range_length = range_length + end_range.char_idx - 1
  end
  return range_length
end
--- Returns the range table for the difference between prev and curr lines
---@param prev_lines table list of lines
---@param curr_lines table list of lines
---@param firstline number line to begin search for first difference
---@param lastline number line to begin search in old_lines for last difference
---@param new_lastline number line to begin search in new_lines for last difference
---@param offset_encoding string encoding requested by language server
---@returns table TextDocumentContentChangeEvent see https://microsoft.github.io/language-server-protocol/specifications/specification-3-17/#textDocumentContentChangeEvent
function M.compute_diff(prev_lines, curr_lines, firstline, lastline, new_lastline, offset_encoding, line_ending)
  -- Find the start of changes between the previous and current buffer. Common between both.
  -- Sent to the server as the start of the changed range.
  -- Used to grab the changed text from the latest buffer.
  local start_range = compute_start_range(
    prev_lines,
    curr_lines,
    firstline + 1,
    lastline + 1,
    new_lastline + 1,
    offset_encoding
  )
  -- Find the last position changed in the previous and current buffer.
  -- prev_end_range is sent to the server as as the end of the changed range.
  -- curr_end_range is used to grab the changed text from the latest buffer.
  local prev_end_range, curr_end_range = compute_end_range(
    prev_lines,
    curr_lines,
    start_range,
    firstline + 1,
    lastline + 1,
    new_lastline + 1,
    offset_encoding
  )
  -- Grab the changed text of from start_range to curr_end_range in the current buffer.
  -- The text range is "" if entire range is deleted.
  local text = extract_text(curr_lines, start_range, curr_end_range, line_ending)
  -- Compute the range of the replaced text. Deprecated but still required for certain language servers
  local range_length = compute_range_length(prev_lines, start_range, prev_end_range, offset_encoding, line_ending)
  -- convert to 0 based indexing
  local result = {
    range = {
      ['start'] = { line = start_range.line_idx - 1, character = start_range.char_idx - 1 },
      ['end'] = { line = prev_end_range.line_idx - 1, character = prev_end_range.char_idx - 1 },
    },
    text = text,
    rangeLength = range_length,
  }
  return result
end
return M
 |