about summary refs log tree commit diff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
author Josh Rahm <joshuarahm@gmail.com> 2023-11-29 22:39:54 +0000
committer Josh Rahm <joshuarahm@gmail.com> 2023-11-29 22:39:54 +0000
commit 21cb7d04c387e4198ca8098a884c78b56ffcf4c2 (patch)
tree 84fe5690df1551f0bb2bdfe1a13aacd29ebc1de7 /src/nvim/mbyte.c
parent d9c904f85a23a496df4eb6be42aa43f007b22d50 (diff)
parent 4a8bf24ac690004aedf5540fa440e788459e5e34 (diff)
download rneovim-colorcolchar.tar.gz
rneovim-colorcolchar.tar.bz2
rneovim-colorcolchar.zip
Merge remote-tracking branch 'upstream/master' into colorcolchar (branch: colorcolchar)
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- src/nvim/mbyte.c 466
1 files changed, 229 insertions, 237 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 8b50ba719a..f2883cc5c7 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1,6 +1,3 @@
-// This is an open source non-commercial project. Dear PVS-Studio, please check
-// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
-
/// mbyte.c: Code specifically for handling multi-byte characters.
/// Multibyte extensions partly by Sung-Hoon Baek
///
@@ -29,18 +26,21 @@
#include <ctype.h>
#include <errno.h>
#include <iconv.h>
+#include <locale.h>
#include <stdbool.h>
+#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <wchar.h>
+#include <sys/types.h>
#include <wctype.h>
#include "auto/config.h"
#include "nvim/arabic.h"
-#include "nvim/ascii.h"
+#include "nvim/ascii_defs.h"
#include "nvim/buffer_defs.h"
#include "nvim/charset.h"
+#include "nvim/cmdexpand_defs.h"
#include "nvim/cursor.h"
#include "nvim/drawscreen.h"
#include "nvim/eval/typval.h"
@@ -48,28 +48,23 @@
#include "nvim/getchar.h"
#include "nvim/gettext.h"
#include "nvim/globals.h"
-#include "nvim/grid_defs.h"
-#include "nvim/iconv.h"
+#include "nvim/grid.h"
+#include "nvim/iconv_defs.h"
#include "nvim/keycodes.h"
-#include "nvim/macros.h"
+#include "nvim/macros_defs.h"
#include "nvim/mark.h"
#include "nvim/mbyte.h"
#include "nvim/mbyte_defs.h"
#include "nvim/memline.h"
#include "nvim/memory.h"
#include "nvim/message.h"
-#include "nvim/option_defs.h"
+#include "nvim/option_vars.h"
+#include "nvim/optionstr.h"
#include "nvim/os/os.h"
-#include "nvim/os/os_defs.h"
-#include "nvim/pos.h"
-#include "nvim/screen.h"
+#include "nvim/pos_defs.h"
#include "nvim/strings.h"
-#include "nvim/types.h"
-#include "nvim/vim.h"
-
-#ifdef HAVE_LOCALE_H
-# include <locale.h>
-#endif
+#include "nvim/types_defs.h"
+#include "nvim/vim_defs.h"
typedef struct {
int rangeStart;
@@ -79,8 +74,8 @@ typedef struct {
} convertStruct;
struct interval {
- long first;
- long last;
+ int first;
+ int last;
};
// uncrustify:off
@@ -90,17 +85,17 @@ struct interval {
#endif
// uncrustify:on
-static char e_list_item_nr_is_not_list[]
+static const char e_list_item_nr_is_not_list[]
= N_("E1109: List item %d is not a List");
-static char e_list_item_nr_does_not_contain_3_numbers[]
+static const char e_list_item_nr_does_not_contain_3_numbers[]
= N_("E1110: List item %d does not contain 3 numbers");
-static char e_list_item_nr_range_invalid[]
+static const char e_list_item_nr_range_invalid[]
= N_("E1111: List item %d range invalid");
-static char e_list_item_nr_cell_width_invalid[]
+static const char e_list_item_nr_cell_width_invalid[]
= N_("E1112: List item %d cell width invalid");
-static char e_overlapping_ranges_for_nr[]
+static const char e_overlapping_ranges_for_nr[]
= N_("E1113: Overlapping ranges for 0x%lx");
-static char e_only_values_of_0x80_and_higher_supported[]
+static const char e_only_values_of_0x80_and_higher_supported[]
= N_("E1114: Only values of 0x80 and higher supported");
// To speed up BYTELEN(); keep a lookup table to quickly get the length in
@@ -370,7 +365,7 @@ static int enc_canon_search(const char *name)
int enc_canon_props(const char *name)
FUNC_ATTR_PURE
{
- int i = enc_canon_search((char *)name);
+ int i = enc_canon_search(name);
if (i >= 0) {
return enc_canon_table[i].prop;
} else if (strncmp(name, "2byte-", 6) == 0) {
@@ -449,18 +444,16 @@ int mb_get_class_tab(const char *p, const uint64_t *const chartab)
static bool intable(const struct interval *table, size_t n_items, int c)
FUNC_ATTR_PURE
{
- int mid, bot, top;
-
// first quick check for Latin1 etc. characters
if (c < table[0].first) {
return false;
}
// binary search in table
- bot = 0;
- top = (int)(n_items - 1);
+ int bot = 0;
+ int top = (int)(n_items - 1);
while (top >= bot) {
- mid = (bot + top) / 2;
+ int mid = (bot + top) / 2;
if (table[mid].last < c) {
bot = mid + 1;
} else if (table[mid].first > c) {
@@ -518,11 +511,9 @@ int utf_char2cells(int c)
/// This doesn't take care of unprintable characters, use ptr2cells() for that.
int utf_ptr2cells(const char *p)
{
- int c;
-
// Need to convert to a character number.
if ((uint8_t)(*p) >= 0x80) {
- c = utf_ptr2char(p);
+ int c = utf_ptr2char(p);
// An illegal byte is displayed as <xx>.
if (utf_ptr2len(p) == 1 || c == NUL) {
return 4;
@@ -540,16 +531,14 @@ int utf_ptr2cells(const char *p)
/// For an empty string or truncated character returns 1.
int utf_ptr2cells_len(const char *p, int size)
{
- int c;
-
// Need to convert to a wide character.
if (size > 0 && (uint8_t)(*p) >= 0x80) {
if (utf_ptr2len_len(p, size) < utf8len_tab[(uint8_t)(*p)]) {
return 1; // truncated
}
- c = utf_ptr2char((char *)p);
+ int c = utf_ptr2char(p);
// An illegal byte is displayed as <xx>.
- if (utf_ptr2len((char *)p) == 1 || c == NUL) {
+ if (utf_ptr2len(p) == 1 || c == NUL) {
return 4;
}
// If the char is ASCII it must be an overlong sequence.
@@ -662,34 +651,32 @@ int utf_ptr2char(const char *const p_in)
//
// If byte sequence is illegal or incomplete, returns -1 and does not advance
// "s".
-static int utf_safe_read_char_adv(const char_u **s, size_t *n)
+static int utf_safe_read_char_adv(const char **s, size_t *n)
{
- int c;
-
if (*n == 0) { // end of buffer
return 0;
}
- uint8_t k = utf8len_tab_zero[**s];
+ uint8_t k = utf8len_tab_zero[(uint8_t)(**s)];
if (k == 1) {
// ASCII character or NUL
(*n)--;
- return *(*s)++;
+ return (uint8_t)(*(*s)++);
}
if (k <= *n) {
// We have a multibyte sequence and it isn't truncated by buffer
// limits so utf_ptr2char() is safe to use. Or the first byte is
// illegal (k=0), and it's also safe to use utf_ptr2char().
- c = utf_ptr2char((char *)(*s));
+ int c = utf_ptr2char(*s);
// On failure, utf_ptr2char() returns the first byte, so here we
// check equality with the first byte. The only non-ASCII character
// which equals the first byte of its own UTF-8 representation is
// U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
// It's safe even if n=1, else we would have k=2 > n.
- if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
+ if (c != (int)((uint8_t)(**s)) || (c == 0xC3 && (uint8_t)(*s)[1] == 0x83)) {
// byte sequence was successfully decoded
*s += k;
*n -= k;
@@ -705,9 +692,7 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
// Note: composing characters are skipped!
int mb_ptr2char_adv(const char **const pp)
{
- int c;
-
- c = utf_ptr2char(*pp);
+ int c = utf_ptr2char(*pp);
*pp += utfc_ptr2len(*pp);
return c;
}
@@ -716,9 +701,7 @@ int mb_ptr2char_adv(const char **const pp)
// Note: composing characters are returned as separate characters.
int mb_cptr2char_adv(const char **pp)
{
- int c;
-
- c = utf_ptr2char(*pp);
+ int c = utf_ptr2char(*pp);
*pp += utf_ptr2len(*pp);
return c;
}
@@ -728,92 +711,78 @@ int mb_cptr2char_adv(const char **pp)
/// behaves like a composing character.
bool utf_composinglike(const char *p1, const char *p2)
{
- int c2;
-
- c2 = utf_ptr2char((char *)p2);
+ int c2 = utf_ptr2char(p2);
if (utf_iscomposing(c2)) {
return true;
}
if (!arabic_maycombine(c2)) {
return false;
}
- return arabic_combine(utf_ptr2char((char *)p1), c2);
+ return arabic_combine(utf_ptr2char(p1), c2);
}
-/// Convert a UTF-8 string to a wide character
+/// Get the screen char at the beginning of a string
+///
+/// Caller is expected to check for things like unprintable chars etc
+/// If first char in string is a composing char, prepend a space to display it correctly.
///
-/// Also gets up to #MAX_MCO composing characters.
+/// If "p" starts with an invalid sequence, zero is returned.
///
-/// @param[out] pcc Location where to store composing characters. Must have
-/// space at least for #MAX_MCO + 1 elements.
+/// @param[out] firstc (required) The first codepoint of the screen char,
+/// or the first byte of an invalid sequence
///
-/// @return leading character.
-int utfc_ptr2char(const char *p, int *pcc)
+/// @return the char
+schar_T utfc_ptr2schar(const char *p, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
- int i = 0;
-
int c = utf_ptr2char(p);
- int len = utf_ptr2len(p);
+ *firstc = c; // NOT optional, you are gonna need it
+ bool first_compose = utf_iscomposing(c);
+ size_t maxlen = MAX_SCHAR_SIZE - 1 - first_compose;
+ size_t len = (size_t)utfc_ptr2len_len(p, (int)maxlen);
- // Only accept a composing char when the first char isn't illegal.
- if ((len > 1 || (uint8_t)(*p) < 0x80)
- && (uint8_t)p[len] >= 0x80
- && utf_composinglike(p, p + len)) {
- int cc = utf_ptr2char(p + len);
- for (;;) {
- pcc[i++] = cc;
- if (i == MAX_MCO) {
- break;
- }
- len += utf_ptr2len(p + len);
- if ((uint8_t)p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
- break;
- }
- }
- }
-
- if (i < MAX_MCO) { // last composing char must be 0
- pcc[i] = 0;
+ if (len == 1 && (uint8_t)(*p) >= 0x80) {
+ return 0; // invalid sequence
}
- return c;
+ return schar_from_buf_first(p, len, first_compose);
}
-// Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
-// composing characters. Use no more than p[maxlen].
-//
-// @param [out] pcc: composing chars, last one is 0
-int utfc_ptr2char_len(const char *p, int *pcc, int maxlen)
+/// Get the screen char at the beginning of a string with length
+///
+/// Like utfc_ptr2schar but use no more than p[maxlen].
+schar_T utfc_ptr2schar_len(const char *p, int maxlen, int *firstc)
+ FUNC_ATTR_NONNULL_ALL
{
assert(maxlen > 0);
- int i = 0;
+ size_t len = (size_t)utf_ptr2len_len(p, maxlen);
+ if (len > (size_t)maxlen || (len == 1 && (uint8_t)(*p) >= 0x80) || len == 0) {
+ // invalid or truncated sequence
+ *firstc = (uint8_t)(*p);
+ return 0;
+ }
- int len = utf_ptr2len_len(p, maxlen);
- // Is it safe to use utf_ptr2char()?
- bool safe = len > 1 && len <= maxlen;
- int c = safe ? utf_ptr2char(p) : (uint8_t)(*p);
+ int c = utf_ptr2char(p);
+ *firstc = c;
+ bool first_compose = utf_iscomposing(c);
+ maxlen = MIN(maxlen, MAX_SCHAR_SIZE - 1 - first_compose);
+ len = (size_t)utfc_ptr2len_len(p, maxlen);
- // Only accept a composing char when the first char isn't illegal.
- if ((safe || c < 0x80) && len < maxlen && (uint8_t)p[len] >= 0x80) {
- for (; i < MAX_MCO; i++) {
- int len_cc = utf_ptr2len_len(p + len, maxlen - len);
- safe = len_cc > 1 && len_cc <= maxlen - len;
- if (!safe || (pcc[i] = utf_ptr2char(p + len)) < 0x80
- || !(i == 0 ? utf_composinglike(p, p + len) : utf_iscomposing(pcc[i]))) {
- break;
- }
- len += len_cc;
- }
- }
+ return schar_from_buf_first(p, len, first_compose);
+}
- if (i < MAX_MCO) {
- // last composing char must be 0
- pcc[i] = 0;
+/// Caller must ensure there is space for `first_compose`
+static schar_T schar_from_buf_first(const char *buf, size_t len, bool first_compose)
+{
+ if (first_compose) {
+ char cbuf[MAX_SCHAR_SIZE];
+ cbuf[0] = ' ';
+ memcpy(cbuf + 1, buf, len);
+ return schar_from_buf(cbuf, len + 1);
+ } else {
+ return schar_from_buf(buf, len);
}
-
- return c;
-#undef ISCOMPOSING
}
/// Get the length of a UTF-8 byte sequence representing a single codepoint
@@ -854,11 +823,9 @@ int utf_byte2len(int b)
// Never returns zero.
int utf_ptr2len_len(const char *p, int size)
{
- int len;
- int i;
int m;
- len = utf8len_tab[(uint8_t)(*p)];
+ int len = utf8len_tab[(uint8_t)(*p)];
if (len == 1) {
return 1; // NUL, ascii or illegal lead byte
}
@@ -867,7 +834,7 @@ int utf_ptr2len_len(const char *p, int size)
} else {
m = len;
}
- for (i = 1; i < m; i++) {
+ for (int i = 1; i < m; i++) {
if ((p[i] & 0xc0) != 0x80) {
return 1;
}
@@ -898,10 +865,9 @@ int utfc_ptr2len(const char *const p)
return 1;
}
- // Check for composing characters. We can handle only the first six, but
- // skip all of them (otherwise the cursor would get stuck).
+ // Check for composing characters.
int prevlen = 0;
- for (;;) {
+ while (true) {
if ((uint8_t)p[len] < 0x80 || !utf_composinglike(p + prevlen, p + len)) {
return len;
}
@@ -918,9 +884,6 @@ int utfc_ptr2len(const char *const p)
/// Returns 1 for an illegal char or an incomplete byte sequence.
int utfc_ptr2len_len(const char *p, int size)
{
- int len;
- int prevlen;
-
if (size < 1 || *p == NUL) {
return 0;
}
@@ -929,7 +892,7 @@ int utfc_ptr2len_len(const char *p, int size)
}
// Skip over first UTF-8 char, stopping at a NUL byte.
- len = utf_ptr2len_len(p, size);
+ int len = utf_ptr2len_len(p, size);
// Check for illegal byte and incomplete byte sequence.
if ((len == 1 && (uint8_t)p[0] >= 0x80) || len > size) {
@@ -938,17 +901,15 @@ int utfc_ptr2len_len(const char *p, int size)
// Check for composing characters. We can handle only the first six, but
// skip all of them (otherwise the cursor would get stuck).
- prevlen = 0;
+ int prevlen = 0;
while (len < size) {
- int len_next_char;
-
if ((uint8_t)p[len] < 0x80) {
break;
}
// Next character length should not go beyond size to ensure that
// utf_composinglike(...) does not read beyond size.
- len_next_char = utf_ptr2len_len(p + len, size - len);
+ int len_next_char = utf_ptr2len_len(p + len, size - len);
if (len_next_char > size - len) {
break;
}
@@ -1063,9 +1024,9 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
{
// sorted list of non-overlapping intervals
static struct clinterval {
- unsigned int first;
- unsigned int last;
- unsigned int class;
+ unsigned first;
+ unsigned last;
+ unsigned cls;
} classes[] = {
{ 0x037e, 0x037e, 1 }, // Greek question mark
{ 0x0387, 0x0387, 1 }, // Greek ano teleia
@@ -1141,7 +1102,6 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
};
int bot = 0;
int top = ARRAY_SIZE(classes) - 1;
- int mid;
// First quick check for Latin1 characters, use 'iskeyword'.
if (c < 0x100) {
@@ -1161,13 +1121,13 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
// binary search in table
while (top >= bot) {
- mid = (bot + top) / 2;
- if (classes[mid].last < (unsigned int)c) {
+ int mid = (bot + top) / 2;
+ if (classes[mid].last < (unsigned)c) {
bot = mid + 1;
- } else if (classes[mid].first > (unsigned int)c) {
+ } else if (classes[mid].first > (unsigned)c) {
top = mid - 1;
} else {
- return (int)classes[mid].class;
+ return (int)classes[mid].cls;
}
}
@@ -1186,13 +1146,12 @@ bool utf_ambiguous_width(int c)
// the given conversion "table". Uses binary search on "table".
static int utf_convert(int a, const convertStruct *const table, size_t n_items)
{
- size_t start, mid, end; // indices into table
-
- start = 0;
- end = n_items;
+ // indices into table
+ size_t start = 0;
+ size_t end = n_items;
while (start < end) {
// need to search further
- mid = (end + start) / 2;
+ size_t mid = (end + start) / 2;
if (table[mid].rangeEnd < a) {
start = mid + 1;
} else {
@@ -1285,12 +1244,12 @@ bool mb_isalpha(int a)
return mb_islower(a) || mb_isupper(a);
}
-static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
+static int utf_strnicmp(const char *s1, const char *s2, size_t n1, size_t n2)
{
- int c1, c2, cdiff;
+ int c1, c2;
char buffer[6];
- for (;;) {
+ while (true) {
c1 = utf_safe_read_char_adv(&s1, &n1);
c2 = utf_safe_read_char_adv(&s2, &n2);
@@ -1302,7 +1261,7 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2
continue;
}
- cdiff = utf_fold(c1) - utf_fold(c2);
+ int cdiff = utf_fold(c1) - utf_fold(c2);
if (cdiff != 0) {
return cdiff;
}
@@ -1326,15 +1285,15 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2
// to fold just one character to determine the result of comparison.
if (c1 != -1 && c2 == -1) {
- n1 = (size_t)utf_char2bytes(utf_fold(c1), (char *)buffer);
- s1 = (char_u *)buffer;
+ n1 = (size_t)utf_char2bytes(utf_fold(c1), buffer);
+ s1 = buffer;
} else if (c2 != -1 && c1 == -1) {
- n2 = (size_t)utf_char2bytes(utf_fold(c2), (char *)buffer);
- s2 = (char_u *)buffer;
+ n2 = (size_t)utf_char2bytes(utf_fold(c2), buffer);
+ s2 = buffer;
}
while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
- cdiff = (int)(*s1) - (int)(*s2);
+ int cdiff = (int)((uint8_t)(*s1)) - (int)((uint8_t)(*s2));
if (cdiff != 0) {
return cdiff;
}
@@ -1483,11 +1442,11 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_
FUNC_ATTR_NONNULL_ALL
{
size_t count = 0;
- size_t clen, i;
+ size_t clen;
if (index == 0) {
return 0;
}
- for (i = 0; i < len; i += clen) {
+ for (size_t i = 0; i < len; i += clen) {
clen = (size_t)utf_ptr2len_len(s + i, (int)(len - i));
// NB: gets the byte value of invalid sequence bytes.
// we only care whether the char fits in the BMP or not
@@ -1512,7 +1471,7 @@ ssize_t mb_utf_index_to_bytes(const char *s, size_t len, size_t index, bool use_
/// two characters otherwise.
int mb_strnicmp(const char *s1, const char *s2, const size_t nn)
{
- return utf_strnicmp((char_u *)s1, (char_u *)s2, nn, nn);
+ return utf_strnicmp(s1, s2, nn, nn);
}
/// Compare strings case-insensitively
@@ -1536,23 +1495,18 @@ int mb_stricmp(const char *s1, const char *s2)
// 'encoding' has been set to.
void show_utf8(void)
{
- int len;
- int rlen = 0;
- char *line;
- int clen;
- int i;
-
// Get the byte length of the char under the cursor, including composing
// characters.
- line = get_cursor_pos_ptr();
- len = utfc_ptr2len(line);
+ char *line = get_cursor_pos_ptr();
+ int len = utfc_ptr2len(line);
if (len == 0) {
- msg("NUL");
+ msg("NUL", 0);
return;
}
- clen = 0;
- for (i = 0; i < len; i++) {
+ size_t rlen = 0;
+ int clen = 0;
+ for (int i = 0; i < len; i++) {
if (clen == 0) {
// start of (composing) character, get its length
if (i > 0) {
@@ -1561,16 +1515,17 @@ void show_utf8(void)
}
clen = utf_ptr2len(line + i);
}
- sprintf(IObuff + rlen, "%02x ", // NOLINT(runtime/printf)
- (line[i] == NL) ? NUL : (uint8_t)line[i]); // NUL is stored as NL
+ assert(IOSIZE > rlen);
+ snprintf(IObuff + rlen, IOSIZE - rlen, "%02x ",
+ (line[i] == NL) ? NUL : (uint8_t)line[i]); // NUL is stored as NL
clen--;
- rlen += (int)strlen(IObuff + rlen);
+ rlen += strlen(IObuff + rlen);
if (rlen > IOSIZE - 20) {
break;
}
}
- msg(IObuff);
+ msg(IObuff, 0);
}
/// Return offset from "p" to the start of a character, including composing characters.
@@ -1579,9 +1534,6 @@ void show_utf8(void)
/// Returns 0 when already at the first byte of a character.
int utf_head_off(const char *base_in, const char *p_in)
{
- int c;
- int len;
-
if ((uint8_t)(*p_in) < 0x80) { // be quick for ASCII
return 0;
}
@@ -1603,7 +1555,7 @@ int utf_head_off(const char *base_in, const char *p_in)
}
// Check for illegal sequence. Do allow an illegal byte after where we
// started.
- len = utf8len_tab[*q];
+ int len = utf8len_tab[*q];
if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
return 0;
}
@@ -1612,7 +1564,7 @@ int utf_head_off(const char *base_in, const char *p_in)
break;
}
- c = utf_ptr2char((char *)q);
+ int c = utf_ptr2char((char *)q);
if (utf_iscomposing(c)) {
continue;
}
@@ -1669,7 +1621,7 @@ bool utf_allow_break_before(int cc)
0x2021, // ‡ double dagger
0x2026, // … horizontal ellipsis
0x2030, // ‰ per mille sign
- 0x2031, // ‱ per then thousand sign
+ 0x2031, // ‱ per ten thousand sign
0x203c, // ‼ double exclamation mark
0x2047, // ⁇ double question mark
0x2048, // ⁈ question exclamation mark
@@ -1795,7 +1747,6 @@ int mb_off_next(const char *base, const char *p_in)
{
const uint8_t *p = (uint8_t *)p_in;
int i;
- int j;
if (*p < 0x80) { // be quick for ASCII
return 0;
@@ -1804,6 +1755,7 @@ int mb_off_next(const char *base, const char *p_in)
// Find the next character that isn't 10xx.xxxx
for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
if (i > 0) {
+ int j;
// Check for illegal sequence.
for (j = 0; p - j > (uint8_t *)base; j++) {
if ((p[-j] & 0xc0) != 0x80) {
@@ -1849,33 +1801,35 @@ int utf_cp_tail_off(const char *base, const char *p_in)
/// Return the offset from "p" to the first byte of the codepoint it points
/// to. Can start anywhere in a stream of bytes.
/// Note: Unlike `utf_head_off`, this counts individual codepoints of composed characters
-/// separately and returns a negative offset.
+/// separately.
///
/// @param[in] base Pointer to start of string
/// @param[in] p Pointer to byte for which to return the offset to the previous codepoint
//
-/// @return 0 if invalid sequence, else offset to previous codepoint
-int utf_cp_head_off(const char_u *base, const char_u *p)
+/// @return 0 if invalid sequence, else number of bytes to previous codepoint
+int utf_cp_head_off(const char *base, const char *p)
{
int i;
- int j;
if (*p == NUL) {
return 0;
}
// Find the first character that is not 10xx.xxxx
- for (i = 0; p - i > base; i--) {
- if ((p[i] & 0xc0) != 0x80) {
+ for (i = 0; p - i >= base; i++) {
+ if (((uint8_t)p[-i] & 0xc0) != 0x80) {
break;
}
}
- // Find the last character that is 10xx.xxxx
- for (j = 0; (p[j + 1] & 0xc0) == 0x80; j++) {}
+ // Find the last character that is 10xx.xxxx (condition terminates on NUL)
+ int j = 1;
+ while (((uint8_t)p[j] & 0xc0) == 0x80) {
+ j++;
+ }
// Check for illegal sequence.
- if (utf8len_tab[p[i]] == 1) {
+ if (utf8len_tab[(uint8_t)p[-i]] != j + i) {
return 0;
}
return i;
@@ -1885,8 +1839,6 @@ int utf_cp_head_off(const char_u *base, const char_u *p)
void utf_find_illegal(void)
{
pos_T pos = curwin->w_cursor;
- char *p;
- int len;
vimconv_T vimconv;
char *tofree = NULL;
@@ -1899,8 +1851,8 @@ void utf_find_illegal(void)
}
curwin->w_cursor.coladd = 0;
- for (;;) {
- p = get_cursor_pos_ptr();
+ while (true) {
+ char *p = get_cursor_pos_ptr();
if (vimconv.vc_type != CONV_NONE) {
xfree(tofree);
tofree = string_convert(&vimconv, p, NULL);
@@ -1913,7 +1865,7 @@ void utf_find_illegal(void)
while (*p != NUL) {
// Illegal means that there are not enough trail bytes (checked by
// utf_ptr2len()) or too many of them (overlong sequence).
- len = utf_ptr2len(p);
+ int len = utf_ptr2len(p);
if ((uint8_t)(*p) >= 0x80 && (len == 1 || utf_char2len(utf_ptr2char(p)) != len)) {
if (vimconv.vc_type == CONV_NONE) {
curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
@@ -1948,16 +1900,16 @@ theend:
/// @return true if string "s" is a valid utf-8 string.
/// When "end" is NULL stop at the first NUL. Otherwise stop at "end".
-bool utf_valid_string(const char_u *s, const char_u *end)
+bool utf_valid_string(const char *s, const char *end)
{
- const char_u *p = s;
+ const uint8_t *p = (uint8_t *)s;
- while (end == NULL ? *p != NUL : p < end) {
+ while (end == NULL ? *p != NUL : p < (uint8_t *)end) {
int l = utf8len_tab_zero[*p];
if (l == 0) {
return false; // invalid lead byte
}
- if (end != NULL && p + l > end) {
+ if (end != NULL && p + l > (uint8_t *)end) {
return false; // incomplete byte sequence
}
p++;
@@ -1988,7 +1940,7 @@ void mb_check_adjust_col(void *win_)
// Column 0 is always valid.
if (oldcol != 0) {
- char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
+ char *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum);
colnr_T len = (colnr_T)strlen(p);
// Empty line or invalid column?
@@ -2042,6 +1994,24 @@ int mb_charlen(const char *str)
return count;
}
+int mb_charlen2bytelen(const char *str, int charlen)
+{
+ const char *p = str;
+ int count = 0;
+
+ if (p == NULL) {
+ return 0;
+ }
+
+ for (int i = 0; *p != NUL && i < charlen; i++) {
+ int b = utfc_ptr2len(p);
+ p += b;
+ count += b;
+ }
+
+ return count;
+}
+
/// Like mb_charlen() but for a string with specified length.
int mb_charlen_len(const char *str, int len)
{
@@ -2122,7 +2092,6 @@ char *enc_skip(char *p)
char *enc_canonize(char *enc)
FUNC_ATTR_NONNULL_RET
{
- char *p, *s;
if (strcmp(enc, "default") == 0) {
// Use the default encoding as found by set_init_1().
return xstrdup(fenc_default);
@@ -2131,8 +2100,8 @@ char *enc_canonize(char *enc)
// copy "enc" to allocated memory, with room for two '-'
char *r = xmalloc(strlen(enc) + 3);
// Make it all lower case and replace '_' with '-'.
- p = r;
- for (s = enc; *s != NUL; s++) {
+ char *p = r;
+ for (char *s = enc; *s != NUL; s++) {
if (*s == '_') {
*p++ = '-';
} else {
@@ -2184,9 +2153,7 @@ char *enc_canonize(char *enc)
/// Returns -1 when not found.
static int enc_alias_search(const char *name)
{
- int i;
-
- for (i = 0; enc_alias_table[i].name != NULL; i++) {
+ for (int i = 0; enc_alias_table[i].name != NULL; i++) {
if (strcmp(name, enc_alias_table[i].name) == 0) {
return enc_alias_table[i].canon;
}
@@ -2210,10 +2177,7 @@ char *enc_locale(void)
if (!(s = nl_langinfo(CODESET)) || *s == NUL)
#endif
{
-#if defined(HAVE_LOCALE_H)
- if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL)
-#endif
- {
+ if (!(s = setlocale(LC_CTYPE, NULL)) || *s == NUL) {
if ((s = os_getenv("LC_ALL"))) {
if ((s = os_getenv("LC_CTYPE"))) {
s = os_getenv("LANG");
@@ -2269,17 +2233,14 @@ enc_locale_copy_enc:
// (should return iconv_t, but that causes problems with prototypes).
void *my_iconv_open(char *to, char *from)
{
- iconv_t fd;
#define ICONV_TESTLEN 400
char tobuf[ICONV_TESTLEN];
- char *p;
- size_t tolen;
static WorkingStatus iconv_working = kUnknown;
if (iconv_working == kBroken) {
return (void *)-1; // detected a broken iconv() previously
}
- fd = iconv_open(enc_skip(to), enc_skip(from));
+ iconv_t fd = iconv_open(enc_skip(to), enc_skip(from));
if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
// Do a dummy iconv() call to check if it actually works. There is a
@@ -2287,8 +2248,8 @@ void *my_iconv_open(char *to, char *from)
// because it's wide-spread. The symptoms are that after outputting
// the initial shift state the "to" pointer is NULL and conversion
// stops for no apparent reason after about 8160 characters.
- p = tobuf;
- tolen = ICONV_TESTLEN;
+ char *p = tobuf;
+ size_t tolen = ICONV_TESTLEN;
(void)iconv(fd, NULL, NULL, &p, &tolen);
if (p == NULL) {
iconv_working = kBroken;
@@ -2310,24 +2271,19 @@ void *my_iconv_open(char *to, char *from)
static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t slen,
size_t *unconvlenp, size_t *resultlenp)
{
- const char *from;
- size_t fromlen;
char *to;
- size_t tolen;
size_t len = 0;
size_t done = 0;
char *result = NULL;
- char *p;
- int l;
- from = str;
- fromlen = slen;
- for (;;) {
+ const char *from = str;
+ size_t fromlen = slen;
+ while (true) {
if (len == 0 || ICONV_ERRNO == ICONV_E2BIG) {
// Allocate enough room for most conversions. When re-allocating
// increase the buffer size.
len = len + fromlen * 2 + 40;
- p = xmalloc(len);
+ char *p = xmalloc(len);
if (done > 0) {
memmove(p, result, done);
}
@@ -2336,7 +2292,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
}
to = result + done;
- tolen = len - done - 2;
+ size_t tolen = len - done - 2;
// Avoid a warning for systems with a wrong iconv() prototype by
// casting the second argument to void *.
if (iconv(vcp->vc_fd, (void *)&from, &fromlen, &to, &tolen) != SIZE_MAX) {
@@ -2366,7 +2322,7 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
if (utf_ptr2cells(from) > 1) {
*to++ = '?';
}
- l = utfc_ptr2len_len(from, (int)fromlen);
+ int l = utfc_ptr2len_len(from, (int)fromlen);
from += l;
fromlen -= (size_t)l;
} else if (ICONV_ERRNO != ICONV_E2BIG) {
@@ -2384,6 +2340,34 @@ static char *iconv_string(const vimconv_T *const vcp, const char *str, size_t sl
return result;
}
+/// iconv() function
+void f_iconv(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
+{
+ vimconv_T vimconv;
+
+ rettv->v_type = VAR_STRING;
+ rettv->vval.v_string = NULL;
+
+ const char *const str = tv_get_string(&argvars[0]);
+ char buf1[NUMBUFLEN];
+ char *const from = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[1], buf1)));
+ char buf2[NUMBUFLEN];
+ char *const to = enc_canonize(enc_skip((char *)tv_get_string_buf(&argvars[2], buf2)));
+ vimconv.vc_type = CONV_NONE;
+ convert_setup(&vimconv, from, to);
+
+ // If the encodings are equal, no conversion needed.
+ if (vimconv.vc_type == CONV_NONE) {
+ rettv->vval.v_string = xstrdup(str);
+ } else {
+ rettv->vval.v_string = string_convert(&vimconv, (char *)str, NULL);
+ }
+
+ convert_setup(&vimconv, NULL, NULL);
+ xfree(from);
+ xfree(to);
+}
+
/// Setup "vcp" for conversion from "from" to "to".
/// The names must have been made canonical with enc_canonize().
/// vcp->vc_type must have been initialized to CONV_NONE.
@@ -2402,8 +2386,6 @@ int convert_setup(vimconv_T *vcp, char *from, char *to)
int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, char *to,
bool to_unicode_is_utf8)
{
- int from_prop;
- int to_prop;
int from_is_utf8;
int to_is_utf8;
@@ -2419,8 +2401,8 @@ int convert_setup_ext(vimconv_T *vcp, char *from, bool from_unicode_is_utf8, cha
return OK;
}
- from_prop = enc_canon_props(from);
- to_prop = enc_canon_props(to);
+ int from_prop = enc_canon_props(from);
+ int to_prop = enc_canon_props(to);
if (from_unicode_is_utf8) {
from_is_utf8 = from_prop & ENC_UNICODE;
} else {
@@ -2477,9 +2459,8 @@ char *string_convert(const vimconv_T *const vcp, char *ptr, size_t *lenp)
// set to the number of remaining bytes.
char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, size_t *unconvlenp)
{
- char_u *retval = NULL;
- char_u *d;
- int l;
+ uint8_t *retval = NULL;
+ uint8_t *d;
int c;
size_t len;
@@ -2499,10 +2480,10 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
for (size_t i = 0; i < len; i++) {
c = (uint8_t)ptr[i];
if (c < 0x80) {
- *d++ = (char_u)c;
+ *d++ = (uint8_t)c;
} else {
- *d++ = (char_u)(0xc0 + (char_u)((unsigned)c >> 6));
- *d++ = (char_u)(0x80 + (c & 0x3f));
+ *d++ = (uint8_t)(0xc0 + (uint8_t)((unsigned)c >> 6));
+ *d++ = (uint8_t)(0x80 + (c & 0x3f));
}
}
*d = NUL;
@@ -2547,7 +2528,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
retval = xmalloc(len + 1);
d = retval;
for (size_t i = 0; i < len; i++) {
- l = utf_ptr2len_len(ptr + i, (int)(len - i));
+ int l = utf_ptr2len_len(ptr + i, (int)(len - i));
if (l == 0) {
*d++ = NUL;
} else if (l == 1) {
@@ -2597,7 +2578,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
}
if (!utf_iscomposing(c)) { // skip composing chars
if (c < 0x100) {
- *d++ = (char_u)c;
+ *d++ = (uint8_t)c;
} else if (vcp->vc_fail) {
xfree(retval);
return NULL;
@@ -2618,7 +2599,7 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
break;
case CONV_ICONV: // conversion with vcp->vc_fd
- retval = (char_u *)iconv_string(vcp, ptr, len, unconvlenp, lenp);
+ retval = (uint8_t *)iconv_string(vcp, ptr, len, unconvlenp, lenp);
break;
}
@@ -2627,8 +2608,8 @@ char *string_convert_ext(const vimconv_T *const vcp, char *ptr, size_t *lenp, si
/// Table set by setcellwidths().
typedef struct {
- long first;
- long last;
+ int64_t first;
+ int64_t last;
char width;
} cw_interval_T;
@@ -2753,7 +2734,7 @@ void f_setcellwidths(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
const listitem_T *lili = tv_list_first(li_l);
const varnumber_T n1 = TV_LIST_ITEM_TV(lili)->vval.v_number;
if (item > 0 && n1 <= table[item - 1].last) {
- semsg(_(e_overlapping_ranges_for_nr), (long)n1);
+ semsg(_(e_overlapping_ranges_for_nr), (size_t)n1);
xfree((void *)ptrs);
xfree(table);
return;
@@ -2810,3 +2791,14 @@ void f_charclass(typval_T *argvars, typval_T *rettv, EvalFuncData fptr)
}
rettv->vval.v_number = mb_get_class(argvars[0].vval.v_string);
}
+
+/// Function given to ExpandGeneric() to obtain the possible arguments of the
+/// encoding options.
+char *get_encoding_name(expand_T *xp FUNC_ATTR_UNUSED, int idx)
+{
+ if (idx >= (int)ARRAY_SIZE(enc_canon_table)) {
+ return NULL;
+ }
+
+ return (char *)enc_canon_table[idx].name;
+}