aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBjörn Linse <bjorn.linse@gmail.com>2019-08-04 12:22:22 +0200
committerBjörn Linse <bjorn.linse@gmail.com>2019-08-06 20:24:36 +0200
commitc0993ed3433ef4111a39e59642d15b15261e8b68 (patch)
treecb2a4225d70dec491f96412f96ebcb7c5a391e9a
parentb0e26199ec02c9b392af6161522004c55db0441f (diff)
downloadrneovim-c0993ed3433ef4111a39e59642d15b15261e8b68.tar.gz
rneovim-c0993ed3433ef4111a39e59642d15b15261e8b68.tar.bz2
rneovim-c0993ed3433ef4111a39e59642d15b15261e8b68.zip
lua: support getting UTF-32 and UTF-16 sizes of replaced text
-rw-r--r--runtime/doc/api.txt9
-rw-r--r--src/nvim/api/buffer.c11
-rw-r--r--src/nvim/buffer_defs.h17
-rw-r--r--src/nvim/buffer_updates.c16
-rw-r--r--src/nvim/fileio.c2
-rw-r--r--src/nvim/globals.h2
-rw-r--r--src/nvim/mbyte.c33
-rw-r--r--src/nvim/memline.c48
-rw-r--r--src/nvim/misc1.c4
-rw-r--r--test/functional/lua/buffer_updates_spec.lua80
10 files changed, 192 insertions, 30 deletions
diff --git a/runtime/doc/api.txt b/runtime/doc/api.txt
index f4366cb1af..2c6b053994 100644
--- a/runtime/doc/api.txt
+++ b/runtime/doc/api.txt
@@ -208,14 +208,17 @@ they are allowed.
|nvim_buf_attach| will take keyword args for the callbacks. "on_lines" will
receive parameters ("lines", {buf}, {changedtick}, {firstline}, {lastline},
-{new_lastline}, {old_bytecount}).
+{new_lastline}, {old_byte_size}[, {old_utf32_size}, {old_utf16_size}]).
Unlike remote channel events the text contents are not passed. The new text can
be accessed inside the callback as
`vim.api.nvim_buf_get_lines(buf, firstline, new_lastline, true)`
-{old_bytecount} is the total size of the replaced region {firstline} to
-{lastline} in bytes, including the final newline after {lastline}.
+{old_byte_size} is the total size of the replaced region {firstline} to
+{lastline} in bytes, including the final newline after {lastline}. If
+`utf_sizes` is set to true in |nvim_buf_attach()| keyword args, then the
+UTF-32 and UTF-16 sizes of the deleted region are also passed as additional
+arguments {old_utf32_size} and {old_utf16_size}.
"on_changedtick" is invoked when |b:changedtick| was incremented but no text
was changed. The parameters received are ("changedtick", {buf}, {changedtick}).
diff --git a/src/nvim/api/buffer.c b/src/nvim/api/buffer.c
index 497b4ae9a4..c6f82e9d85 100644
--- a/src/nvim/api/buffer.c
+++ b/src/nvim/api/buffer.c
@@ -109,9 +109,11 @@ String buffer_get_line(Buffer buffer, Integer index, Error *err)
/// `nvim_buf_lines_event`. Otherwise, the first notification will be
/// a `nvim_buf_changedtick_event`. Not used for lua callbacks.
/// @param opts Optional parameters.
-/// `on_lines`: lua callback received on change.
+/// `on_lines`: lua callback received on change.
/// `on_changedtick`: lua callback received on changedtick
/// increment without text change.
+/// `utf_sizes`: include UTF-32 and UTF-16 size of
+/// the replaced region.
/// See |api-buffer-updates-lua| for more information
/// @param[out] err Error details, if any
/// @return False when updates couldn't be enabled because the buffer isn't
@@ -156,6 +158,12 @@ Boolean nvim_buf_attach(uint64_t channel_id,
}
cb.on_detach = v->data.luaref;
v->data.integer = LUA_NOREF;
+ } else if (is_lua && strequal("utf_sizes", k.data)) {
+ if (v->type != kObjectTypeBoolean) {
+ api_set_error(err, kErrorTypeValidation, "utf_sizes must be boolean");
+ goto error;
+ }
+ cb.utf_sizes = v->data.boolean;
} else {
api_set_error(err, kErrorTypeValidation, "unexpected key: %s", k.data);
goto error;
@@ -1196,6 +1204,7 @@ Dictionary nvim__buf_stats(Buffer buffer, Error *err)
// NB: this should be zero at any time API functions are called,
// this exists to debug issues
PUT(rv, "dirty_bytes", INTEGER_OBJ((Integer)buf->deleted_bytes));
+
return rv;
}
diff --git a/src/nvim/buffer_defs.h b/src/nvim/buffer_defs.h
index eb26e4ad8e..b11eaefdd0 100644
--- a/src/nvim/buffer_defs.h
+++ b/src/nvim/buffer_defs.h
@@ -459,8 +459,9 @@ typedef struct {
LuaRef on_lines;
LuaRef on_changedtick;
LuaRef on_detach;
+ bool utf_sizes;
} BufUpdateCallbacks;
-#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF }
+#define BUF_UPDATE_CALLBACKS_INIT { LUA_NOREF, LUA_NOREF, LUA_NOREF, false }
#define BUF_HAS_QF_ENTRY 1
#define BUF_HAS_LL_ENTRY 2
@@ -802,12 +803,24 @@ struct file_buffer {
kvec_t(BufhlLine *) b_bufhl_move_space; // temporary space for highlights
- // array of channelids which have asked to receive updates for this
+ // array of channel_id:s which have asked to receive updates for this
// buffer.
kvec_t(uint64_t) update_channels;
+ // array of lua callbacks for buffer updates.
kvec_t(BufUpdateCallbacks) update_callbacks;
+ // whether an update callback has requested codepoint size of deleted regions.
+ bool update_need_codepoints;
+
+ // Measurements of the deleted or replaced region since the last update
+ // event. Some consumers of buffer changes need to know the byte size (like
+ // tree-sitter) or the corresponding UTF-32/UTF-16 size (like LSP) of the
+ // deleted text.
size_t deleted_bytes;
+ size_t deleted_codepoints;
+ size_t deleted_codeunits;
+
+ // The number of times the current line has been flushed in the memline.
int flush_count;
int b_diff_failed; // internal diff failed for this buffer
diff --git a/src/nvim/buffer_updates.c b/src/nvim/buffer_updates.c
index 7dea8bfac5..3604578b50 100644
--- a/src/nvim/buffer_updates.c
+++ b/src/nvim/buffer_updates.c
@@ -26,6 +26,9 @@ bool buf_updates_register(buf_T *buf, uint64_t channel_id,
if (channel_id == LUA_INTERNAL_CALL) {
kv_push(buf->update_callbacks, cb);
+ if (cb.utf_sizes) {
+ buf->update_need_codepoints = true;
+ }
return true;
}
@@ -169,7 +172,9 @@ void buf_updates_send_changes(buf_T *buf,
int64_t num_removed,
bool send_tick)
{
- size_t deleted_bytes = ml_flush_deleted_bytes(buf);
+ size_t deleted_codepoints, deleted_codeunits;
+ size_t deleted_bytes = ml_flush_deleted_bytes(buf, &deleted_codepoints,
+ &deleted_codeunits);
if (!buf_updates_active(buf)) {
return;
@@ -233,8 +238,8 @@ void buf_updates_send_changes(buf_T *buf,
bool keep = true;
if (cb.on_lines != LUA_NOREF) {
Array args = ARRAY_DICT_INIT;
- Object items[6];
- args.size = 6;
+ Object items[8];
+ args.size = 6; // may be increased to 8 below
args.items = items;
// the first argument is always the buffer handle
@@ -254,6 +259,11 @@ void buf_updates_send_changes(buf_T *buf,
// byte count of previous contents
args.items[5] = INTEGER_OBJ((Integer)deleted_bytes);
+ if (cb.utf_sizes) {
+ args.size = 8;
+ args.items[6] = INTEGER_OBJ((Integer)deleted_codepoints);
+ args.items[7] = INTEGER_OBJ((Integer)deleted_codeunits);
+ }
textlock++;
Object res = executor_exec_lua_cb(cb.on_lines, "lines", args, true);
textlock--;
diff --git a/src/nvim/fileio.c b/src/nvim/fileio.c
index 2232de8c1e..d03b9138d0 100644
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@@ -1756,6 +1756,8 @@ failed:
linecnt--;
}
curbuf->deleted_bytes = 0;
+ curbuf->deleted_codepoints = 0;
+ curbuf->deleted_codeunits = 0;
linecnt = curbuf->b_ml.ml_line_count - linecnt;
if (filesize == 0)
linecnt = 0;
diff --git a/src/nvim/globals.h b/src/nvim/globals.h
index de6f59b3f1..4524c4b2c0 100644
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@@ -627,6 +627,8 @@ EXTERN pos_T Insstart_orig;
EXTERN int orig_line_count INIT(= 0); /* Line count when "gR" started */
EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
+// increase around internal delete/replace
+EXTERN int inhibit_delete_count INIT(= 0);
/*
* These flags are set based upon 'fileencoding'.
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index e7579399f3..bf8ce46113 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1438,6 +1438,39 @@ int utf16_to_utf8(const wchar_t *strw, char **str)
#endif
+/// Measure the length of a string in corresponding UTF-32 and UTF-16 units.
+///
+/// Invalid UTF-8 bytes, or embedded surrogates, count as one code point/unit
+/// each. A character above U+FFFF counts as one code point but two code units.
+///
+/// The out parameters are incremented, not overwritten, so the caller must
+/// initialize them. This is used to measure the size of a buffer region
+/// consisting of multiple line segments.
+///
+/// @param s the string
+/// @param len maximum number of bytes to measure (an earlier NUL terminates)
+/// @param[in,out] codepoints incremented with UTF-32 code point size
+/// @param[in,out] codeunits incremented with UTF-16 code unit size
+void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
+ size_t *codeunits)
+ FUNC_ATTR_NONNULL_ALL
+{
+ size_t count = 0, extra = 0;  // count: chars seen; extra: chars above 0xFFFF
+ size_t clen;  // step size for the current character, set at the top of each iteration
+ for (size_t i = 0; i < len && s[i] != NUL; i += clen) {
+ clen = utf_ptr2len_len(s+i, len-i);  // only the remaining len-i bytes are offered
+ // NB: gets the byte value of invalid sequence bytes.
+ // We only care whether the char fits in the BMP or not.
+ int c = (clen > 1) ? utf_ptr2char(s+i) : s[i];
+ count++;
+ if (c > 0xFFFF) {
+ extra++;  // outside the BMP: needs one additional UTF-16 code unit
+ }
+ }
+ *codepoints += count;
+ *codeunits += count + extra;
+}
+
+
/*
* Version of strnicmp() that handles multi-byte characters.
* Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
diff --git a/src/nvim/memline.c b/src/nvim/memline.c
index 0b16f86416..3220c7d9b8 100644
--- a/src/nvim/memline.c
+++ b/src/nvim/memline.c
@@ -2383,6 +2383,23 @@ static int ml_append_int(
return OK;
}
+void ml_add_deleted_len(char_u *ptr, ssize_t len)  // add the size of line text `ptr` to curbuf's deleted_* counters
+{
+ if (inhibit_delete_count) {  // nothing is counted while this global is non-zero
+ return;
+ }
+ if (len == -1) {  // -1 means the length is not given: measure up to the NUL
+ len = STRLEN(ptr);
+ }
+ curbuf->deleted_bytes += len+1;  // +1 for the NL char, as for the UTF sizes below
+ if (curbuf->update_need_codepoints) {  // UTF sizes are tracked only when this flag is set
+ mb_utflen(ptr, len, &curbuf->deleted_codepoints,
+ &curbuf->deleted_codeunits);
+ curbuf->deleted_codepoints++; // NL char
+ curbuf->deleted_codeunits++;  // NL char is a single UTF-16 code unit too
+ }
+}
+
/*
* Replace line lnum, with buffering, in current buffer.
*
@@ -2408,19 +2425,17 @@ int ml_replace(linenr_T lnum, char_u *line, bool copy)
if (copy) {
line = vim_strsave(line);
}
- if (curbuf->b_ml.ml_line_lnum != lnum) { /* other line buffered */
- ml_flush_line(curbuf); /* flush it */
- } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { /* same line allocated */
- // TODO FIXME: see other "TODO FIXME"
- curbuf->deleted_bytes += STRLEN(curbuf->b_ml.ml_line_ptr)+1;
- xfree(curbuf->b_ml.ml_line_ptr); /* free it */
- readlen = false; // already read it.
+ if (curbuf->b_ml.ml_line_lnum != lnum) { // other line buffered
+ ml_flush_line(curbuf); // flush it
+ } else if (curbuf->b_ml.ml_flags & ML_LINE_DIRTY) { // same line allocated
+ ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, -1);
+ readlen = false; // already added the length
+
+ xfree(curbuf->b_ml.ml_line_ptr); // free it
}
- if (readlen) {
- if (true) { // TODO: buffer updates active
- curbuf->deleted_bytes += STRLEN(ml_get_buf(curbuf, lnum, false))+1;
- }
+ if (readlen && kv_size(curbuf->update_callbacks)) {
+ ml_add_deleted_len(ml_get_buf(curbuf, lnum, false), -1);
}
curbuf->b_ml.ml_line_ptr = line;
@@ -2504,7 +2519,10 @@ static int ml_delete_int(buf_T *buf, linenr_T lnum, bool message)
else
line_size = ((dp->db_index[idx - 1]) & DB_INDEX_MASK) - line_start;
- buf->deleted_bytes += line_size;
+ // Line should always have an NL char internally (represented as NUL),
+ // even if 'noeol' is set.
+ assert(line_size >= 1);
+ ml_add_deleted_len((char_u *)dp + line_start, line_size-1);
/*
* special case: If there is only one line in the data block it becomes empty.
@@ -2690,10 +2708,14 @@ void ml_clearmarked(void)
return;
}
-size_t ml_flush_deleted_bytes(buf_T *buf)
+size_t ml_flush_deleted_bytes(buf_T *buf, size_t *codepoints, size_t *codeunits)
{
size_t ret = buf->deleted_bytes;
+ *codepoints = buf->deleted_codepoints;
+ *codeunits = buf->deleted_codeunits;
buf->deleted_bytes = 0;
+ buf->deleted_codepoints = 0;
+ buf->deleted_codeunits = 0;
return ret;
}
diff --git a/src/nvim/misc1.c b/src/nvim/misc1.c
index 112ca6f287..a62fa6d585 100644
--- a/src/nvim/misc1.c
+++ b/src/nvim/misc1.c
@@ -780,6 +780,7 @@ open_line (
did_append = FALSE;
}
+ inhibit_delete_count++;
if (newindent
|| did_si
) {
@@ -821,6 +822,7 @@ open_line (
did_si = false;
}
}
+ inhibit_delete_count--;
/*
* In REPLACE mode, for each character in the extra leader, there must be
@@ -1685,7 +1687,7 @@ int del_bytes(colnr_T count, bool fixpos_arg, bool use_delcombine)
bool was_alloced = ml_line_alloced(); // check if oldp was allocated
char_u *newp;
if (was_alloced) {
- curbuf->deleted_bytes += (size_t)oldlen+1;
+ ml_add_deleted_len(curbuf->b_ml.ml_line_ptr, oldlen);
newp = oldp; // use same allocated memory
} else { // need to allocate a new line
newp = xmalloc((size_t)(oldlen + 1 - count));
diff --git a/test/functional/lua/buffer_updates_spec.lua b/test/functional/lua/buffer_updates_spec.lua
index 16c38bc20b..990cb97fec 100644
--- a/test/functional/lua/buffer_updates_spec.lua
+++ b/test/functional/lua/buffer_updates_spec.lua
@@ -13,7 +13,8 @@ local origlines = {"original line 1",
"original line 3",
"original line 4",
"original line 5",
- "original line 6"}
+ "original line 6",
+ " indented line"}
describe('lua: buffer event callbacks', function()
before_each(function()
@@ -21,14 +22,14 @@ describe('lua: buffer event callbacks', function()
exec_lua([[
local events = {}
- function test_register(bufnr, id, changedtick)
+ function test_register(bufnr, id, changedtick, utf_sizes)
local function callback(...)
table.insert(events, {id, ...})
if test_unreg == id then
return true
end
end
- local opts = {on_lines=callback, on_detach=callback}
+ local opts = {on_lines=callback, on_detach=callback, utf_sizes=utf_sizes}
if changedtick then
opts.on_changedtick = callback
end
@@ -48,18 +49,26 @@ describe('lua: buffer event callbacks', function()
-- assert the wrong thing), but masks errors with unflushed lines (as
-- nvim_buf_get_offset forces a flush of the memline). To be safe run the
-- test both ways.
- local function check(verify)
+ local function check(verify,utf_sizes)
local lastsize
meths.buf_set_lines(0, 0, -1, true, origlines)
if verify then
lastsize = meths.buf_get_offset(0, meths.buf_line_count(0))
end
- exec_lua("return test_register(...)", 0, "test1")
+ exec_lua("return test_register(...)", 0, "test1",false,utf_sizes)
local tick = meths.buf_get_changedtick(0)
local verify_name = "test1"
local function check_events(expected)
local events = exec_lua("return get_events(...)" )
+ if utf_sizes then
+ -- this test case uses ASCII only, so sizes should be the same.
+ -- Unicode is tested below.
+ for _, event in ipairs(expected) do
+ event[9] = event[8]
+ event[10] = event[8]
+ end
+ end
eq(expected, events)
if verify then
for _, event in ipairs(events) do
@@ -75,6 +84,7 @@ describe('lua: buffer event callbacks', function()
end
end
+ command('set autoindent')
command('normal! GyyggP')
tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 0, 0, 1, 0}})
@@ -83,7 +93,7 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1
check_events({{ "test1", "lines", 1, tick, 3, 5, 4, 32 }})
- exec_lua("return test_register(...)", 0, "test2", true)
+ exec_lua("return test_register(...)", 0, "test2", true, utf_sizes)
tick = tick + 1
command('undo')
@@ -124,7 +134,13 @@ describe('lua: buffer event callbacks', function()
tick = tick + 1
check_events({{ "test2", "lines", 1, tick, 4, 5, 5, 19 }})
- feed('<esc>')
+ feed('<esc>Go')
+ tick = tick + 1
+ check_events({{ "test2", "lines", 1, tick, 11, 11, 12, 0 }})
+
+ feed('x')
+ tick = tick + 1
+ check_events({{ "test2", "lines", 1, tick, 11, 12, 12, 5 }})
command('bwipe!')
check_events({{ "test2", "detach", 1 }})
@@ -137,4 +153,54 @@ describe('lua: buffer event callbacks', function()
it('works with verify', function()
check(true)
end)
+
+ it('works with utf_sizes and ASCII text', function()
+ check(false,true)
+ end)
+
+ it('works with utf_sizes and unicode text', function()
+ local unicode_text = {"ascii text",
+ "latin text åäö",
+ "BMP text ɧ αλφά",
+ "BMP text 汉语 ↥↧",
+ "SMP 🤦 🦄🦃",
+ "combining å بِيَّة"}
+ meths.buf_set_lines(0, 0, -1, true, unicode_text)
+ feed('gg')
+ exec_lua("return test_register(...)", 0, "test1", false, true)
+ local tick = meths.buf_get_changedtick(0)
+
+ feed('dd')
+ tick = tick + 1
+ eq({{ "test1", "lines", 1, tick, 0, 1, 0, 11, 11, 11 }}, exec_lua("return get_events(...)" ))
+
+ feed('A<bs>')
+ tick = tick + 1
+ eq({{ "test1", "lines", 1, tick, 0, 1, 1, 18, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+ feed('<esc>jylp')
+ tick = tick + 1
+ eq({{ "test1", "lines", 1, tick, 1, 2, 2, 21, 16, 16 }}, exec_lua("return get_events(...)" ))
+
+ feed('+eea<cr>')
+ tick = tick + 1
+ eq({{ "test1", "lines", 1, tick, 2, 3, 4, 23, 15, 15 }}, exec_lua("return get_events(...)" ))
+
+ feed('<esc>jdw')
+ tick = tick + 1
+ -- non-BMP chars count as 2 UTF-16 code units
+ eq({{ "test1", "lines", 1, tick, 4, 5, 5, 18, 9, 12 }}, exec_lua("return get_events(...)" ))
+
+ feed('+rx')
+ tick = tick + 1
+ -- count the individual codepoints of a composed character.
+ eq({{ "test1", "lines", 1, tick, 5, 6, 6, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+
+ feed('kJ')
+ tick = tick + 1
+ -- NB: this is inefficient (but not really wrong).
+ eq({{ "test1", "lines", 1, tick, 4, 5, 5, 14, 5, 8 },
+ { "test1", "lines", 1, tick+1, 5, 6, 5, 27, 20, 20 }}, exec_lua("return get_events(...)" ))
+ end)
+
end)