fix(diff): use mmfile_t in linematch

Problem: Linematch used to use strchr to navigate a string, however strchr does not supoprt embedded NULs. Solution: Use `mmfile_t` instead of `char *` in linematch and introduce `strnchr()`. Also remove heap allocations from `matching_char_iwhite()` Fixes: #30505
author: Lewis Russell <lewis6991@gmail.com> 2024-09-26 16:10:11 +0100
committer: Lewis Russell <lewis6991@gmail.com> 2024-09-30 11:51:33 +0100
commit: c65646c2474d22948c604168a68f6626a645d1d2 (patch)
tree: fcf8e49ae06878638e0dacc34e08d7892ccc8524
parent: 20251be15a4ad3f6e7016450ca3338d52b2f0951 (diff)
download: rneovim-c65646c2474d22948c604168a68f6626a645d1d2.tar.gz
rneovim-c65646c2474d22948c604168a68f6626a645d1d2.tar.bz2
rneovim-c65646c2474d22948c604168a68f6626a645d1d2.zip
9 files changed, 107 insertions, 78 deletions
diff --git a/runtime/doc/lua.txt b/runtime/doc/lua.txt
index 29b35d69a0..cbe283065d 100644
--- a/runtime/doc/lua.txt
+++ b/runtime/doc/lua.txt
@@ -713,8 +713,8 @@ vim.diff({a}, {b}, {opts})                                        *vim.diff()*
     Parameters: ~
       • {a}     (`string`) First string to compare
       • {b}     (`string`) Second string to compare
-      • {opts}  (`table`) Optional parameters:
-                • {on_hunk}
+      • {opts}  (`table?`) Optional parameters:
+                • {on_hunk}?
                   (`fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer`)
                   Invoked for each hunk in the diff. Return a negative number
                   to cancel the callback for any remaining hunks. Arguments:
@@ -722,33 +722,33 @@ vim.diff({a}, {b}, {opts})                                        *vim.diff()*
                   • `count_a` (`integer`): Hunk size in {a}.
                   • `start_b` (`integer`): Start line of hunk in {b}.
                   • `count_b` (`integer`): Hunk size in {b}.
-                • {result_type} (`'unified'|'indices'`, default: `'unified'`)
+                • {result_type}? (`'unified'|'indices'`, default: `'unified'`)
                   Form of the returned diff:
                   • `unified`: String in unified format.
                   • `indices`: Array of hunk locations. Note: This option is
                     ignored if `on_hunk` is used.
-                • {linematch} (`boolean|integer`) Run linematch on the
+                • {linematch}? (`boolean|integer`) Run linematch on the
                   resulting hunks from xdiff. When integer, only hunks upto
                   this size in lines are run through linematch. Requires
                   `result_type = indices`, ignored otherwise.
-                • {algorithm} (`'myers'|'minimal'|'patience'|'histogram'`,
+                • {algorithm}? (`'myers'|'minimal'|'patience'|'histogram'`,
                   default: `'myers'`) Diff algorithm to use. Values:
                   • `myers`: the default algorithm
                   • `minimal`: spend extra time to generate the smallest
                     possible diff
                   • `patience`: patience diff algorithm
                   • `histogram`: histogram diff algorithm
-                • {ctxlen} (`integer`) Context length
-                • {interhunkctxlen} (`integer`) Inter hunk context length
-                • {ignore_whitespace} (`boolean`) Ignore whitespace
-                • {ignore_whitespace_change} (`boolean`) Ignore whitespace
+                • {ctxlen}? (`integer`) Context length
+                • {interhunkctxlen}? (`integer`) Inter hunk context length
+                • {ignore_whitespace}? (`boolean`) Ignore whitespace
+                • {ignore_whitespace_change}? (`boolean`) Ignore whitespace
                   change
-                • {ignore_whitespace_change_at_eol} (`boolean`) Ignore
+                • {ignore_whitespace_change_at_eol}? (`boolean`) Ignore
                   whitespace change at end-of-line.
-                • {ignore_cr_at_eol} (`boolean`) Ignore carriage return at
+                • {ignore_cr_at_eol}? (`boolean`) Ignore carriage return at
                   end-of-line
-                • {ignore_blank_lines} (`boolean`) Ignore blank lines
-                • {indent_heuristic} (`boolean`) Use the indent heuristic for
+                • {ignore_blank_lines}? (`boolean`) Ignore blank lines
+                • {indent_heuristic}? (`boolean`) Use the indent heuristic for
                   the internal diff library.
 
     Return: ~
diff --git a/runtime/lua/vim/_meta/diff.lua b/runtime/lua/vim/_meta/diff.lua
index 617bc87f59..5a0de68119 100644
--- a/runtime/lua/vim/_meta/diff.lua
+++ b/runtime/lua/vim/_meta/diff.lua
@@ -11,19 +11,19 @@
 ---   - `count_a` (`integer`): Hunk size in {a}.
 ---   - `start_b` (`integer`): Start line of hunk in {b}.
 ---   - `count_b` (`integer`): Hunk size in {b}.
---- @field on_hunk fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer
+--- @field on_hunk? fun(start_a: integer, count_a: integer, start_b: integer, count_b: integer): integer
 ---
 --- Form of the returned diff:
 ---   - `unified`: String in unified format.
 ---   - `indices`: Array of hunk locations.
 --- Note: This option is ignored if `on_hunk` is used.
 --- (default: `'unified'`)
---- @field result_type 'unified'|'indices'
+--- @field result_type? 'unified'|'indices'
 ---
 --- Run linematch on the resulting hunks from xdiff. When integer, only hunks
 --- upto this size in lines are run through linematch.
 --- Requires `result_type = indices`, ignored otherwise.
---- @field linematch boolean|integer
+--- @field linematch? boolean|integer
 ---
 --- Diff algorithm to use. Values:
 ---   - `myers`: the default algorithm
@@ -31,15 +31,15 @@
 ---   - `patience`: patience diff algorithm
 ---   - `histogram`: histogram diff algorithm
 --- (default: `'myers'`)
---- @field algorithm 'myers'|'minimal'|'patience'|'histogram'
---- @field ctxlen integer Context length
---- @field interhunkctxlen integer Inter hunk context length
---- @field ignore_whitespace boolean Ignore whitespace
---- @field ignore_whitespace_change boolean Ignore whitespace change
---- @field ignore_whitespace_change_at_eol boolean Ignore whitespace change at end-of-line.
---- @field ignore_cr_at_eol boolean Ignore carriage return at end-of-line
---- @field ignore_blank_lines boolean Ignore blank lines
---- @field indent_heuristic boolean Use the indent heuristic for the internal diff library.
+--- @field algorithm? 'myers'|'minimal'|'patience'|'histogram'
+--- @field ctxlen? integer Context length
+--- @field interhunkctxlen? integer Inter hunk context length
+--- @field ignore_whitespace? boolean Ignore whitespace
+--- @field ignore_whitespace_change? boolean Ignore whitespace change
+--- @field ignore_whitespace_change_at_eol? boolean Ignore whitespace change at end-of-line.
+--- @field ignore_cr_at_eol? boolean Ignore carriage return at end-of-line
+--- @field ignore_blank_lines? boolean Ignore blank lines
+--- @field indent_heuristic? boolean Use the indent heuristic for the internal diff library.
 
 -- luacheck: no unused args
 
@@ -65,7 +65,7 @@
 ---
 ---@param a string First string to compare
 ---@param b string Second string to compare
----@param opts vim.diff.Opts
+---@param opts? vim.diff.Opts
 ---@return string|integer[][]?
 ---     See {opts.result_type}. `nil` if {opts.on_hunk} is given.
 function vim.diff(a, b, opts) end
diff --git a/src/clint.py b/src/clint.py
index 1ed01382c8..b57bbe354b 100755
--- a/src/clint.py
+++ b/src/clint.py
@@ -881,6 +881,7 @@ def CheckIncludes(filename, lines, error):
             "nvim/func_attr.h",
             "termkey/termkey.h",
             "vterm/vterm.h",
+            "xdiff/xdiff.h",
             ]
 
     for i in check_includes_ignore:
diff --git a/src/nvim/diff.c b/src/nvim/diff.c
index 05e5bed50c..d22fb65827 100644
--- a/src/nvim/diff.c
+++ b/src/nvim/diff.c
@@ -2005,7 +2005,7 @@ static void run_linematch_algorithm(diff_T *dp)
 {
   // define buffers for diff algorithm
   mmfile_t diffbufs_mm[DB_COUNT];
-  const char *diffbufs[DB_COUNT];
+  const mmfile_t *diffbufs[DB_COUNT];
   int diff_length[DB_COUNT];
   size_t ndiffs = 0;
   for (int i = 0; i < DB_COUNT; i++) {
@@ -2015,9 +2015,7 @@ static void run_linematch_algorithm(diff_T *dp)
       diff_write_buffer(curtab->tp_diffbuf[i], &diffbufs_mm[ndiffs],
                         dp->df_lnum[i], dp->df_lnum[i] + dp->df_count[i] - 1);
 
-      // we want to get the char* to the diff buffer that was just written
-      // we add it to the array of char*, diffbufs
-      diffbufs[ndiffs] = diffbufs_mm[ndiffs].ptr;
+      diffbufs[ndiffs] = &diffbufs_mm[ndiffs];
 
       // keep track of the length of this diff block to pass it to the linematch
       // algorithm
diff --git a/src/nvim/linematch.c b/src/nvim/linematch.c
index 24b2c82381..8943e6e8a6 100644
--- a/src/nvim/linematch.c
+++ b/src/nvim/linematch.c
@@ -10,6 +10,8 @@
 #include "nvim/macros_defs.h"
 #include "nvim/memory.h"
 #include "nvim/pos_defs.h"
+#include "nvim/strings.h"
+#include "xdiff/xdiff.h"
 
 #define LN_MAX_BUFS 8
 #define LN_DECISION_MAX 255  // pow(2, LN_MAX_BUFS(8)) - 1 = 255
@@ -29,49 +31,49 @@ struct diffcmppath_S {
 # include "linematch.c.generated.h"
 #endif
 
-static size_t line_len(const char *s)
+static size_t line_len(const mmfile_t *m)
 {
-  char *end = strchr(s, '\n');
+  char *s = m->ptr;
+  size_t n = (size_t)m->size;
+  char *end = strnchr(s, &n, '\n');
   if (end) {
     return (size_t)(end - s);
   }
-  return strlen(s);
+  return (size_t)m->size;
 }
 
+#define MATCH_CHAR_MAX_LEN 800
+
 /// Same as matching_chars but ignore whitespace
 ///
 /// @param s1
 /// @param s2
-static int matching_chars_iwhite(const char *s1, const char *s2)
+static int matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2)
 {
   // the newly processed strings that will be compared
-  // delete the white space characters, and/or replace all upper case with lower
-  char *strsproc[2];
-  const char *strsorig[2] = { s1, s2 };
+  // delete the white space characters
+  mmfile_t sp[2];
+  char p[2][MATCH_CHAR_MAX_LEN];
   for (int k = 0; k < 2; k++) {
-    size_t d = 0;
-    size_t i = 0;
-    size_t slen = line_len(strsorig[k]);
-    strsproc[k] = xmalloc((slen + 1) * sizeof(char));
-    while (d + i < slen) {
-      char e = strsorig[k][i + d];
+    const mmfile_t *s = k == 0 ? s1 : s2;
+    size_t pi = 0;
+    size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s));
+    for (size_t i = 0; i <= slen; i++) {
+      char e = s->ptr[i];
       if (e != ' ' && e != '\t') {
-        strsproc[k][i] = e;
-        i++;
-      } else {
-        d++;
+        p[k][pi] = e;
+        pi++;
       }
     }
-    strsproc[k][i] = NUL;
+
+    sp[k] = (mmfile_t){
+      .ptr = p[k],
+      .size = (int)pi,
+    };
   }
-  int matching = matching_chars(strsproc[0], strsproc[1]);
-  xfree(strsproc[0]);
-  xfree(strsproc[1]);
-  return matching;
+  return matching_chars(&sp[0], &sp[1]);
 }
 
-#define MATCH_CHAR_MAX_LEN 800
-
 /// Return matching characters between "s1" and "s2" whilst respecting sequence order.
 /// Consider the case of two strings 'AAACCC' and 'CCCAAA', the
 /// return value from this function will be 3, either to match
@@ -83,12 +85,14 @@ static int matching_chars_iwhite(const char *s1, const char *s2)
 ///   matching_chars("abcdefg", "gfedcba")         -> 1  // all characters in common,
 ///                                                      // but only at most 1 in sequence
 ///
-/// @param s1
-/// @param s2
-static int matching_chars(const char *s1, const char *s2)
+/// @param m1
+/// @param m2
+static int matching_chars(const mmfile_t *m1, const mmfile_t *m2)
 {
-  size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s1));
-  size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s2));
+  size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1));
+  size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2));
+  char *s1 = m1->ptr;
+  char *s2 = m2->ptr;
   int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 };
   bool icur = 1;  // save space by storing only two rows for i axis
   for (size_t i = 0; i < s1len; i++) {
@@ -119,13 +123,13 @@ static int matching_chars(const char *s1, const char *s2)
 /// @param sp
 /// @param fomvals
 /// @param n
-static int count_n_matched_chars(const char **sp, const size_t n, bool iwhite)
+static int count_n_matched_chars(mmfile_t **sp, const size_t n, bool iwhite)
 {
   int matched_chars = 0;
   int matched = 0;
   for (size_t i = 0; i < n; i++) {
     for (size_t j = i + 1; j < n; j++) {
-      if (sp[i] != NULL && sp[j] != NULL) {
+      if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) {
         matched++;
         // TODO(lewis6991): handle whitespace ignoring higher up in the stack
         matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j])
@@ -143,15 +147,17 @@ static int count_n_matched_chars(const char **sp, const size_t n, bool iwhite)
   return matched_chars;
 }
 
-void fastforward_buf_to_lnum(const char **s, linenr_T lnum)
+mmfile_t fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum)
 {
   for (int i = 0; i < lnum - 1; i++) {
-    *s = strchr(*s, '\n');
-    if (!*s) {
-      return;
+    s.ptr = strnchr(s.ptr, (size_t *)&s.size, '\n');
+    if (!s.ptr) {
+      break;
     }
-    (*s)++;
+    s.ptr++;
+    s.size--;
   }
+  return s;
 }
 
 /// try all the different ways to compare these lines and use the one that
@@ -167,25 +173,25 @@ void fastforward_buf_to_lnum(const char **s, linenr_T lnum)
 /// @param diff_blk
 static void try_possible_paths(const int *df_iters, const size_t *paths, const int npaths,
                                const int path_idx, int *choice, diffcmppath_T *diffcmppath,
-                               const int *diff_len, const size_t ndiffs, const char **diff_blk,
+                               const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
                                bool iwhite)
 {
   if (path_idx == npaths) {
     if ((*choice) > 0) {
       int from_vals[LN_MAX_BUFS] = { 0 };
       const int *to_vals = df_iters;
-      const char *current_lines[LN_MAX_BUFS];
+      mmfile_t mm[LN_MAX_BUFS];  // stack memory for current_lines
+      mmfile_t *current_lines[LN_MAX_BUFS];
       for (size_t k = 0; k < ndiffs; k++) {
         from_vals[k] = df_iters[k];
         // get the index at all of the places
         if ((*choice) & (1 << k)) {
           from_vals[k]--;
-          const char *p = diff_blk[k];
-          fastforward_buf_to_lnum(&p, df_iters[k]);
-          current_lines[k] = p;
+          mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]);
         } else {
-          current_lines[k] = NULL;
+          mm[k] = (mmfile_t){ 0 };
         }
+        current_lines[k] = &mm[k];
       }
       size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs);
       size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs);
@@ -244,7 +250,7 @@ static size_t unwrap_indexes(const int *values, const int *diff_len, const size_
 /// @param ndiffs
 /// @param diff_blk
 static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *diffcmppath,
-                            const int *diff_len, const size_t ndiffs, const char **diff_blk,
+                            const int *diff_len, const size_t ndiffs, const mmfile_t **diff_blk,
                             bool iwhite)
 {
   if (ch_dim == ndiffs) {
@@ -327,7 +333,7 @@ static void populate_tensor(int *df_iters, const size_t ch_dim, diffcmppath_T *d
 /// @param ndiffs
 /// @param [out] [allocated] decisions
 /// @return the length of decisions
-size_t linematch_nbuffers(const char **diff_blk, const int *diff_len, const size_t ndiffs,
+size_t linematch_nbuffers(const mmfile_t **diff_blk, const int *diff_len, const size_t ndiffs,
                           int **decisions, bool iwhite)
 {
   assert(ndiffs <= LN_MAX_BUFS);
diff --git a/src/nvim/linematch.h b/src/nvim/linematch.h
index eaf0d54bec..5f6667a7df 100644
--- a/src/nvim/linematch.h
+++ b/src/nvim/linematch.h
@@ -3,6 +3,7 @@
 #include <stddef.h>  // IWYU pragma: keep
 
 #include "nvim/pos_defs.h"  // IWYU pragma: keep
+#include "xdiff/xdiff.h"
 
 #ifdef INCLUDE_GENERATED_DECLARATIONS
 # include "linematch.h.generated.h"
diff --git a/src/nvim/lua/xdiff.c b/src/nvim/lua/xdiff.c
index 8d791a7e74..b9f96abf73 100644
--- a/src/nvim/lua/xdiff.c
+++ b/src/nvim/lua/xdiff.c
@@ -67,11 +67,11 @@ static void get_linematch_results(lua_State *lstate, mmfile_t *ma, mmfile_t *mb,
                                   int count_a, int start_b, int count_b, bool iwhite)
 {
   // get the pointer to char of the start of the diff to pass it to linematch algorithm
-  const char *diff_begin[2] = { ma->ptr, mb->ptr };
-  int diff_length[2] = { count_a, count_b };
+  mmfile_t ma0 = fastforward_buf_to_lnum(*ma, (linenr_T)start_a + 1);
+  mmfile_t mb0 = fastforward_buf_to_lnum(*mb, (linenr_T)start_b + 1);
 
-  fastforward_buf_to_lnum(&diff_begin[0], (linenr_T)start_a + 1);
-  fastforward_buf_to_lnum(&diff_begin[1], (linenr_T)start_b + 1);
+  const mmfile_t *diff_begin[2] = { &ma0, &mb0 };
+  int diff_length[2] = { count_a, count_b };
 
   int *decisions = NULL;
   size_t decisions_length = linematch_nbuffers(diff_begin, diff_length, 2, &decisions, iwhite);
diff --git a/src/nvim/strings.c b/src/nvim/strings.c
index b7a87ae030..118abbae6d 100644
--- a/src/nvim/strings.c
+++ b/src/nvim/strings.c
@@ -496,6 +496,20 @@ char *vim_strchr(const char *const string, const int c)
   }
 }
 
+// Sized version of strchr that can handle embedded NULs.
+// Adjusts n to the new size.
+char *strnchr(const char *p, size_t *n, int c)
+{
+  while (*n > 0) {
+    if (*p == c) {
+      return (char *)p;
+    }
+    p++;
+    (*n)--;
+  }
+  return NULL;
+}
+
 // Sort an array of strings.
 
 static int sort_compare(const void *s1, const void *s2)
diff --git a/test/functional/lua/xdiff_spec.lua b/test/functional/lua/xdiff_spec.lua
index d5589c1f13..ed65193244 100644
--- a/test/functional/lua/xdiff_spec.lua
+++ b/test/functional/lua/xdiff_spec.lua
@@ -174,4 +174,13 @@ describe('xdiff bindings', function()
       pcall_err(exec_lua, [[vim.diff('a', 'b', { on_hunk = true })]])
     )
   end)
+
+  it('can handle strings with embedded NUL characters (GitHub #30305)', function()
+    eq(
+      { { 0, 0, 1, 1 }, { 1, 0, 3, 2 } },
+      exec_lua(function()
+        return vim.diff('\n', '\0\n\n\nb', { linematch = true, result_type = 'indices' })
+      end)
+    )
+  end)
 end)
author	Lewis Russell <lewis6991@gmail.com>	2024-09-26 16:10:11 +0100
committer	Lewis Russell <lewis6991@gmail.com>	2024-09-30 11:51:33 +0100
commit	c65646c2474d22948c604168a68f6626a645d1d2 (patch)
tree	fcf8e49ae06878638e0dacc34e08d7892ccc8524
parent	20251be15a4ad3f6e7016450ca3338d52b2f0951 (diff)
download	rneovim-c65646c2474d22948c604168a68f6626a645d1d2.tar.gz rneovim-c65646c2474d22948c604168a68f6626a645d1d2.tar.bz2 rneovim-c65646c2474d22948c604168a68f6626a645d1d2.zip