diff options
author | bfredl <bjorn.linse@gmail.com> | 2023-09-19 14:30:02 +0200 |
---|---|---|
committer | bfredl <bjorn.linse@gmail.com> | 2023-10-08 15:22:45 +0200 |
commit | ddef39299f357d3131644647379e88a69749bf40 (patch) | |
tree | 617b46a628180bf4fa5cc56ba8de20b876810163 /src/nvim/grid.c | |
parent | 5db076c7ccfef6732516074252ac4b21b12fc629 (diff) | |
download | rneovim-ddef39299f357d3131644647379e88a69749bf40.tar.gz rneovim-ddef39299f357d3131644647379e88a69749bf40.tar.bz2 rneovim-ddef39299f357d3131644647379e88a69749bf40.zip |
refactor(grid): do arabic shaping in one place
The 'arabicshape' feature of vim is a transformation of unicode text to
make arabic and some related scripts look better at display time. In
particular the content of a cell will be adjusted depending on the
(original) content of the cells just before and after it.
This is implemented by the arabic_shape() function in nvim. Before this
commit, shaping was invoked in four different contexts:
- when rendering buffer text in win_line()
- in line_putchar() for rendering virtual text
- as part of grid_line_puts(), used by messages, statuslines and
similar
- as part of draw_cmdline() for drawing the cmdline
This replaces all these with a post-processing step in grid_put_linebuf(),
which has become the entry point for all text rendering after recent
refactors.
One aim of this change is to make the handling of multibyte text simpler still.
One of the main reasons multibyte chars need to be "parsed" into
codepoint arrays of composing chars is so that these can be inspected
for the purpose of shaping. This can likely be vastly simplified in many
contexts where only the total length (in bytes) and width of a composed
char are needed.
Diffstat (limited to 'src/nvim/grid.c')
-rw-r--r-- | src/nvim/grid.c | 154 |
1 files changed, 116 insertions, 38 deletions
diff --git a/src/nvim/grid.c b/src/nvim/grid.c index 2eab158bc4..7c8823e0d4 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -152,17 +152,17 @@ bool schar_high(schar_T sc) #endif } -void schar_get(char *buf_out, schar_T sc) -{ - if (schar_high(sc)) { #ifdef ORDER_BIG_ENDIAN - uint32_t idx = sc & (0x00FFFFFF); +# define schar_idx(sc) (sc & (0x00FFFFFF)) #else - uint32_t idx = sc >> 8; +# define schar_idx(sc) (sc >> 8) #endif - if (idx >= glyph_cache.h.n_keys) { - abort(); - } + +void schar_get(char *buf_out, schar_T sc) +{ + if (schar_high(sc)) { + uint32_t idx = schar_idx(sc); + assert(idx < glyph_cache.h.n_keys); xstrlcpy(buf_out, &glyph_cache.keys[idx], 32); } else { memcpy(buf_out, (char *)&sc, 4); @@ -170,6 +170,13 @@ void schar_get(char *buf_out, schar_T sc) } } +/// gets first raw UTF-8 byte of an schar +static char schar_get_first_byte(schar_T sc) +{ + assert(!(schar_high(sc) && schar_idx(sc) >= glyph_cache.h.n_keys)); + return schar_high(sc) ? glyph_cache.keys[schar_idx(sc)] : *(char *)&sc; +} + /// @return ascii char or NUL if not ascii char schar_get_ascii(schar_T sc) { @@ -179,6 +186,90 @@ char schar_get_ascii(schar_T sc) return (sc < 0x80) ? 
(char)sc : NUL; #endif } + +static bool schar_in_arabic_block(schar_T sc) +{ + char first_byte = schar_get_first_byte(sc); + return ((uint8_t)first_byte & 0xFE) == 0xD8; +} + +/// Get the first two codepoints of an schar, or NUL when not available +static void schar_get_first_two_codepoints(schar_T sc, int *c0, int *c1) +{ + char sc_buf[MAX_SCHAR_SIZE]; + schar_get(sc_buf, sc); + + *c0 = utf_ptr2char(sc_buf); + int len = utf_ptr2len(sc_buf); + if (*c0 == NUL) { + *c1 = NUL; + } else { + *c1 = utf_ptr2char(sc_buf + len); + } +} + +void line_do_arabic_shape(schar_T *buf, int cols) +{ + int i = 0; + + for (i = 0; i < cols; i++) { + // quickly skip over non-arabic text + if (schar_in_arabic_block(buf[i])) { + break; + } + } + + if (i == cols) { + return; + } + + int c0prev = 0; + int c0, c1; + schar_get_first_two_codepoints(buf[i], &c0, &c1); + + for (; i < cols; i++) { + int c0next, c1next; + schar_get_first_two_codepoints(i + 1 < cols ? buf[i + 1] : 0, &c0next, &c1next); + + if (!ARABIC_CHAR(c0)) { + goto next; + } + + int c1new = c1; + int c0new = arabic_shape(c0, &c1new, c0next, c1next, c0prev); + + if (c0new == c0 && c1new == c1) { + goto next; // unchanged + } + + char scbuf[MAX_SCHAR_SIZE]; + schar_get(scbuf, buf[i]); + + char scbuf_new[MAX_SCHAR_SIZE]; + int len = utf_char2bytes(c0new, scbuf_new); + if (c1new) { + len += utf_char2bytes(c1new, scbuf_new + len); + } + + int off = utf_char2len(c0) + (c1 ? utf_char2len(c1) : 0); + size_t rest = strlen(scbuf + off); + if (rest + (size_t)off + 1 > MAX_SCHAR_SIZE) { + // TODO(bfredl): this cannot happen just yet, as we only construct + // schar_T values with up to MAX_MCO+1 composing codepoints. When code + // is improved so that MAX_SCHAR_SIZE becomes the only/sharp limit, + // we need be able to peel off a composing char which doesn't fit anymore. 
+ abort(); + } + memcpy(scbuf_new + len, scbuf + off, rest); + buf[i] = schar_from_buf(scbuf_new, (size_t)len + rest); + +next: + c0prev = c0; + c0 = c0next; + c1 = c1next; + } +} + /// clear a line in the grid starting at "off" until "width" characters /// are cleared. void grid_clear_line(ScreenGrid *grid, size_t off, int width, bool valid) @@ -242,6 +333,15 @@ void grid_line_start(ScreenGrid *grid, int row) grid_line_first = (int)linebuf_size; grid_line_maxcol = grid->cols - grid_line_coloff; grid_line_last = 0; + + assert((size_t)grid_line_maxcol <= linebuf_size); + + if (rdb_flags & RDB_INVALID) { + // Current batch must not depend on previous contents of linebuf_char. + // Set invalid values which will cause assertion failures later if they are used. + memset(linebuf_char, 0xFF, sizeof(schar_T) * linebuf_size); + memset(linebuf_attr, 0xFF, sizeof(sattr_T) * linebuf_size); + } } /// Get present char from current rendered screen line @@ -287,11 +387,7 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) { const char *ptr = text; int len = textlen; - int c; int u8cc[MAX_MCO]; - int prev_c = 0; // previous Arabic character - int pc, nc, nc1; - int pcc[MAX_MCO]; assert(grid_line_grid); @@ -301,7 +397,6 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) while (col < max_col && (len < 0 || (int)(ptr - text) < len) && *ptr != NUL) { - c = (unsigned char)(*ptr); // check if this is the first byte of a multibyte int mbyte_blen = len > 0 ? utfc_ptr2len_len(ptr, (int)((text + len) - ptr)) @@ -316,37 +411,16 @@ int grid_line_puts(int col, const char *text, int textlen, int attr) u8cc[0] = 0; } - if (p_arshape && !p_tbidi && ARABIC_CHAR(u8c)) { - // Do Arabic shaping. - if (len >= 0 && (int)(ptr - text) + mbyte_blen >= len) { - // Past end of string to be displayed. - nc = NUL; - nc1 = NUL; - } else { - nc = len >= 0 - ? 
utfc_ptr2char_len(ptr + mbyte_blen, pcc, - (int)((text + len) - ptr - mbyte_blen)) - : utfc_ptr2char(ptr + mbyte_blen, pcc); - nc1 = pcc[0]; - } - pc = prev_c; - prev_c = u8c; - u8c = arabic_shape(u8c, &c, &u8cc[0], nc, nc1, pc); - } else { - prev_c = u8c; - } if (col + mbyte_cells > max_col) { // Only 1 cell left, but character requires 2 cells: // display a '>' in the last column to avoid wrapping. */ - c = '>'; u8c = '>'; u8cc[0] = 0; mbyte_cells = 1; } schar_T buf; - // TODO(bfredl): why not just keep the original byte sequence. arabshape is - // an edge case, treat it as such.. + // TODO(bfredl): why not just keep the original byte sequence. buf = schar_from_cc(u8c, u8cc); // When at the start of the text and overwriting the right half of a @@ -545,14 +619,12 @@ static int grid_char_needs_redraw(ScreenGrid *grid, int col, size_t off_to, int /// If "wrap" is true, then hint to the UI that "row" contains a line /// which has wrapped into the next row. void grid_put_linebuf(ScreenGrid *grid, int row, int coloff, int col, int endcol, int clear_width, - int rl, int bg_attr, bool wrap, bool invalid_row) + bool rl, int bg_attr, bool wrap, bool invalid_row) { bool redraw_next; // redraw_this for next character bool clear_next = false; int char_cells; // 1: normal char // 2: occupies two display cells - int start_dirty = -1, end_dirty = 0; - assert(0 <= row && row < grid->rows); // TODO(bfredl): check all callsites and eliminate // Check for illegal col, just in case @@ -591,6 +663,10 @@ void grid_put_linebuf(ScreenGrid *grid, int row, int coloff, int col, int endcol endcol = (clear_width > 0 ? 
clear_width : -clear_width); } + if (p_arshape && !p_tbidi) { + line_do_arabic_shape(linebuf_char + col, endcol - col); + } + if (bg_attr) { for (int c = col; c < endcol; c++) { linebuf_attr[c] = hl_combine_attr(bg_attr, linebuf_attr[c]); @@ -599,6 +675,8 @@ void grid_put_linebuf(ScreenGrid *grid, int row, int coloff, int col, int endcol redraw_next = grid_char_needs_redraw(grid, col, (size_t)col + off_to, endcol - col); + int start_dirty = -1, end_dirty = 0; + while (col < endcol) { char_cells = 1; if (col + 1 < endcol && linebuf_char[col + 1] == 0) { |