aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c1680
1 files changed, 557 insertions, 1123 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index 0ba9f8b076..ead6b4405d 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1,68 +1,30 @@
-/*
- * mbyte.c: Code specifically for handling multi-byte characters.
- * Multibyte extensions partly by Sung-Hoon Baek
- *
- * The encoding used in the core is set with 'encoding'. When 'encoding' is
- * changed, the following four variables are set (for speed).
- * Currently these types of character encodings are supported:
- *
- * "enc_dbcs" When non-zero it tells the type of double byte character
- * encoding (Chinese, Korean, Japanese, etc.).
- * The cell width on the display is equal to the number of
- * bytes. (exception: DBCS_JPNU with first byte 0x8e)
- * Recognizing the first or second byte is difficult, it
- * requires checking a byte sequence from the start.
- * "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding.
- * The cell width on the display needs to be determined from
- * the character value.
- * Recognizing bytes is easy: 0xxx.xxxx is a single-byte
- * char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
- * byte of a multi-byte character.
- * To make things complicated, up to six composing characters
- * are allowed. These are drawn on top of the first char.
- * For most editing the sequence of bytes with composing
- * characters included is considered to be one character.
- * "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16).
- * When 4 use 32-but Unicode characters.
- * Internally characters are stored in UTF-8 encoding to
- * avoid NUL bytes. Conversion happens when doing I/O.
- * "enc_utf8" will also be TRUE.
- *
- * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
- *
- * If none of these is TRUE, 8-bit bytes are used for a character. The
- * encoding isn't currently specified (TODO).
- *
- * 'encoding' specifies the encoding used in the core. This is in registers,
- * text manipulation, buffers, etc. Conversion has to be done when characters
- * in another encoding are received or send:
- *
- * clipboard
- * ^
- * | (2)
- * V
- * +---------------+
- * (1) | | (3)
- * keyboard ----->| core |-----> display
- * | |
- * +---------------+
- * ^
- * | (4)
- * V
- * file
- *
- * (1) Typed characters arrive in the current locale.
- * (2) Text will be made available with the encoding specified with
- * 'encoding'. If this is not sufficient, system-specific conversion
- * might be required.
- * (3) For the GUI the correct font must be selected, no conversion done.
- * (4) The encoding of the file is specified with 'fileencoding'. Conversion
- * is to be done when it's different from 'encoding'.
- *
- * The ShaDa file is a special case: Only text is converted, not file names.
- * Vim scripts may contain an ":encoding" command. This has an effect for
- * some commands, like ":menutrans"
- */
+// This is an open source non-commercial project. Dear PVS-Studio, please check
+// it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
+
+/// mbyte.c: Code specifically for handling multi-byte characters.
+/// Multibyte extensions partly by Sung-Hoon Baek
+///
+/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" is
+/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is
+/// read-only and always reads "utf-8".
+///
+/// The cell width on the display needs to be determined from the character
+/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
+/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
+/// character. To make things complicated, up to six composing characters
+/// are allowed. These are drawn on top of the first char. For most editing
+/// the sequence of bytes with composing characters included is considered to
+/// be one character.
+///
+/// UTF-8 is used everywhere in the core. This is in registers, text
+/// manipulation, buffers, etc. Nvim core communicates with external plugins
+/// and GUIs in this encoding.
+///
+/// The encoding of a file is specified with 'fileencoding'. Conversion
+/// is to be done when it's different from "utf-8".
+///
+/// Vim scripts may contain an ":scriptencoding" command. This has an effect
+/// for some commands, like ":menutrans".
#include <inttypes.h>
#include <stdbool.h>
@@ -75,6 +37,8 @@
#ifdef HAVE_LOCALE_H
# include <locale.h>
#endif
+#include "nvim/eval.h"
+#include "nvim/path.h"
#include "nvim/iconv.h"
#include "nvim/mbyte.h"
#include "nvim/charset.h"
@@ -84,7 +48,6 @@
#include "nvim/memline.h"
#include "nvim/message.h"
#include "nvim/misc1.h"
-#include "nvim/misc2.h"
#include "nvim/memory.h"
#include "nvim/option.h"
#include "nvim/screen.h"
@@ -92,6 +55,7 @@
#include "nvim/strings.h"
#include "nvim/os/os.h"
#include "nvim/arabic.h"
+#include "nvim/mark.h"
typedef struct {
int rangeStart;
@@ -110,37 +74,52 @@ struct interval {
# include "unicode_tables.generated.h"
#endif
-/*
- * Lookup table to quickly get the length in bytes of a UTF-8 character from
- * the first byte of a UTF-8 string.
- * Bytes which are illegal when used as the first byte have a 1.
- * The NUL byte has length 1.
- */
-static char utf8len_tab[256] =
-{
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
+char_u e_loadlib[] = "E370: Could not load library %s";
+char_u e_loadfunc[] = "E448: Could not load library function %s";
+
+// To speed up BYTELEN(); keep a lookup table to quickly get the length in
+// bytes of a UTF-8 character from the first byte of a UTF-8 string. Bytes
+// which are illegal when used as the first byte have a 1. The NUL byte has
+// length 1.
+const uint8_t utf8len_tab[] = {
+ // ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 8?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 9?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // A?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // B?
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C?
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D?
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E?
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1, // F?
};
-/*
- * Like utf8len_tab above, but using a zero for illegal lead bytes.
- */
-static uint8_t utf8len_tab_zero[256] =
-{
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,0,0,
+// Like utf8len_tab above, but using a zero for illegal lead bytes.
+const uint8_t utf8len_tab_zero[] = {
+ // ?1 ?2 ?3 ?4 ?5 ?6 ?7 ?8 ?9 ?A ?B ?C ?D ?E ?F
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6?
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B?
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C?
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D?
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E?
+ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0, // F?
};
/*
@@ -386,207 +365,6 @@ int enc_canon_props(const char_u *name)
}
/*
- * Set up for using multi-byte characters.
- * Called in three cases:
- * - by main() to initialize (p_enc == NULL)
- * - by set_init_1() after 'encoding' was set to its default.
- * - by do_set() when 'encoding' has been set.
- * p_enc must have been passed through enc_canonize() already.
- * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
- * Fills mb_bytelen_tab[] and returns NULL when there are no problems.
- * When there is something wrong: Returns an error message and doesn't change
- * anything.
- */
-char_u * mb_init(void)
-{
- int i;
- int idx;
- int n;
- int enc_dbcs_new = 0;
-#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
- && !defined(MACOS)
-# define LEN_FROM_CONV
- vimconv_T vimconv;
- char_u *p;
-#endif
-
- if (p_enc == NULL) {
- /* Just starting up: set the whole table to one's. */
- for (i = 0; i < 256; ++i)
- mb_bytelen_tab[i] = 1;
- return NULL;
- } else if (STRNCMP(p_enc, "8bit-", 5) == 0
- || STRNCMP(p_enc, "iso-8859-", 9) == 0) {
- /* Accept any "8bit-" or "iso-8859-" name. */
- enc_unicode = 0;
- enc_utf8 = false;
- } else if (STRNCMP(p_enc, "2byte-", 6) == 0) {
- /* Unix: accept any "2byte-" name, assume current locale. */
- enc_dbcs_new = DBCS_2BYTE;
- } else if ((idx = enc_canon_search(p_enc)) >= 0) {
- i = enc_canon_table[idx].prop;
- if (i & ENC_UNICODE) {
- /* Unicode */
- enc_utf8 = true;
- if (i & (ENC_2BYTE | ENC_2WORD))
- enc_unicode = 2;
- else if (i & ENC_4BYTE)
- enc_unicode = 4;
- else
- enc_unicode = 0;
- } else if (i & ENC_DBCS) {
- /* 2byte, handle below */
- enc_dbcs_new = enc_canon_table[idx].codepage;
- } else {
- /* Must be 8-bit. */
- enc_unicode = 0;
- enc_utf8 = false;
- }
- } else /* Don't know what encoding this is, reject it. */
- return e_invarg;
-
- if (enc_dbcs_new != 0) {
- enc_unicode = 0;
- enc_utf8 = false;
- }
- enc_dbcs = enc_dbcs_new;
- has_mbyte = (enc_dbcs != 0 || enc_utf8);
-
-
- /* Detect an encoding that uses latin1 characters. */
- enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0
- || STRCMP(p_enc, "iso-8859-15") == 0);
-
- /*
- * Set the function pointers.
- */
- if (enc_utf8) {
- mb_ptr2len = utfc_ptr2len;
- mb_ptr2len_len = utfc_ptr2len_len;
- mb_char2len = utf_char2len;
- mb_char2bytes = utf_char2bytes;
- mb_ptr2cells = utf_ptr2cells;
- mb_ptr2cells_len = utf_ptr2cells_len;
- mb_char2cells = utf_char2cells;
- mb_off2cells = utf_off2cells;
- mb_ptr2char = utf_ptr2char;
- mb_head_off = utf_head_off;
- } else if (enc_dbcs != 0) {
- mb_ptr2len = dbcs_ptr2len;
- mb_ptr2len_len = dbcs_ptr2len_len;
- mb_char2len = dbcs_char2len;
- mb_char2bytes = dbcs_char2bytes;
- mb_ptr2cells = dbcs_ptr2cells;
- mb_ptr2cells_len = dbcs_ptr2cells_len;
- mb_char2cells = dbcs_char2cells;
- mb_off2cells = dbcs_off2cells;
- mb_ptr2char = dbcs_ptr2char;
- mb_head_off = dbcs_head_off;
- } else {
- mb_ptr2len = latin_ptr2len;
- mb_ptr2len_len = latin_ptr2len_len;
- mb_char2len = latin_char2len;
- mb_char2bytes = latin_char2bytes;
- mb_ptr2cells = latin_ptr2cells;
- mb_ptr2cells_len = latin_ptr2cells_len;
- mb_char2cells = latin_char2cells;
- mb_off2cells = latin_off2cells;
- mb_ptr2char = latin_ptr2char;
- mb_head_off = latin_head_off;
- }
-
- /*
- * Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
- */
-#ifdef LEN_FROM_CONV
- /* When 'encoding' is different from the current locale mblen() won't
- * work. Use conversion to "utf-8" instead. */
- vimconv.vc_type = CONV_NONE;
- if (enc_dbcs) {
- p = enc_locale();
- if (p == NULL || STRCMP(p, p_enc) != 0) {
- convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
- vimconv.vc_fail = true;
- }
- xfree(p);
- }
-#endif
-
- for (i = 0; i < 256; ++i) {
- /* Our own function to reliably check the length of UTF-8 characters,
- * independent of mblen(). */
- if (enc_utf8)
- n = utf8len_tab[i];
- else if (enc_dbcs == 0)
- n = 1;
- else {
- char buf[MB_MAXBYTES + 1];
- if (i == NUL) /* just in case mblen() can't handle "" */
- n = 1;
- else {
- buf[0] = i;
- buf[1] = 0;
-#ifdef LEN_FROM_CONV
- if (vimconv.vc_type != CONV_NONE) {
- /*
- * string_convert() should fail when converting the first
- * byte of a double-byte character.
- */
- p = string_convert(&vimconv, (char_u *)buf, NULL);
- if (p != NULL) {
- xfree(p);
- n = 1;
- } else
- n = 2;
- } else
-#endif
- {
- /*
- * mblen() should return -1 for invalid (means the leading
- * multibyte) character. However there are some platforms
- * where mblen() returns 0 for invalid character.
- * Therefore, following condition includes 0.
- */
- ignored = mblen(NULL, 0); /* First reset the state. */
- if (mblen(buf, (size_t)1) <= 0)
- n = 2;
- else
- n = 1;
- }
- }
- }
- mb_bytelen_tab[i] = n;
- }
-
-#ifdef LEN_FROM_CONV
- convert_setup(&vimconv, NULL, NULL);
-#endif
-
- /* The cell width depends on the type of multi-byte characters. */
- (void)init_chartab();
-
- /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
- screenalloc(false);
-
-#ifdef HAVE_WORKING_LIBINTL
- /* GNU gettext 0.10.37 supports this feature: set the codeset used for
- * translated messages independently from the current locale. */
- (void)bind_textdomain_codeset(PROJECT_NAME,
- enc_utf8 ? "utf-8" : (char *)p_enc);
-#endif
-
-
- /* Fire an autocommand to let people do custom font setup. This must be
- * after Vim has been setup for the new encoding. */
- apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
-
- /* Need to reload spell dictionaries */
- spell_reload();
-
- return NULL;
-}
-
-/*
* Return the size of the BOM for the current buffer:
* 0 - no BOM
* 2 - UCS-2 or UTF-16 BOM
@@ -598,20 +376,15 @@ int bomb_size(void)
int n = 0;
if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
- if (*curbuf->b_p_fenc == NUL) {
- if (enc_utf8) {
- if (enc_unicode != 0)
- n = enc_unicode;
- else
- n = 3;
- }
- } else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
+ if (*curbuf->b_p_fenc == NUL
+ || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
n = 3;
- else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
- || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
+ } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
+ || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
n = 2;
- else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
+ } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
n = 4;
+ }
}
return n;
}
@@ -621,14 +394,13 @@ int bomb_size(void)
*/
void remove_bom(char_u *s)
{
- if (enc_utf8) {
- char_u *p = s;
+ char *p = (char *)s;
- while ((p = vim_strbyte(p, 0xef)) != NULL) {
- if (p[1] == 0xbb && p[2] == 0xbf)
- STRMOVE(p, p + 3);
- else
- ++p;
+ while ((p = strchr(p, 0xef)) != NULL) {
+ if ((uint8_t)p[1] == 0xbb && (uint8_t)p[2] == 0xbf) {
+ STRMOVE(p, p + 3);
+ } else {
+ p++;
}
}
}
@@ -642,259 +414,21 @@ void remove_bom(char_u *s)
*/
int mb_get_class(const char_u *p)
{
- return mb_get_class_buf(p, curbuf);
+ return mb_get_class_tab(p, curbuf->b_chartab);
}
-int mb_get_class_buf(const char_u *p, buf_T *buf)
+int mb_get_class_tab(const char_u *p, const uint64_t *const chartab)
{
if (MB_BYTE2LEN(p[0]) == 1) {
- if (p[0] == NUL || ascii_iswhite(p[0]))
+ if (p[0] == NUL || ascii_iswhite(p[0])) {
return 0;
- if (vim_iswordc_buf(p[0], buf))
+ }
+ if (vim_iswordc_tab(p[0], chartab)) {
return 2;
+ }
return 1;
}
- if (enc_dbcs != 0 && p[0] != NUL && p[1] != NUL)
- return dbcs_class(p[0], p[1]);
- if (enc_utf8)
- return utf_class(utf_ptr2char(p));
- return 0;
-}
-
-/*
- * Get class of a double-byte character. This always returns 3 or bigger.
- * TODO: Should return 1 for punctuation.
- */
-int dbcs_class(unsigned lead, unsigned trail)
-{
- switch (enc_dbcs) {
- /* please add classify routine for your language in here */
-
- case DBCS_JPNU: /* ? */
- case DBCS_JPN:
- {
- /* JIS code classification */
- unsigned char lb = lead;
- unsigned char tb = trail;
-
- /* convert process code to JIS */
- /*
- * XXX: Code page identification can not use with all
- * system! So, some other encoding information
- * will be needed.
- * In japanese: SJIS,EUC,UNICODE,(JIS)
- * Note that JIS-code system don't use as
- * process code in most system because it uses
- * escape sequences(JIS is context depend encoding).
- */
- /* assume process code is JAPANESE-EUC */
- lb &= 0x7f;
- tb &= 0x7f;
- /* exceptions */
- switch (lb << 8 | tb) {
- case 0x2121: /* ZENKAKU space */
- return 0;
- case 0x2122: /* TOU-TEN (Japanese comma) */
- case 0x2123: /* KU-TEN (Japanese period) */
- case 0x2124: /* ZENKAKU comma */
- case 0x2125: /* ZENKAKU period */
- return 1;
- case 0x213c: /* prolongedsound handled as KATAKANA */
- return 13;
- }
- /* sieved by KU code */
- switch (lb) {
- case 0x21:
- case 0x22:
- /* special symbols */
- return 10;
- case 0x23:
- /* alpha-numeric */
- return 11;
- case 0x24:
- /* hiragana */
- return 12;
- case 0x25:
- /* katakana */
- return 13;
- case 0x26:
- /* greek */
- return 14;
- case 0x27:
- /* russian */
- return 15;
- case 0x28:
- /* lines */
- return 16;
- default:
- /* kanji */
- return 17;
- }
- }
-
- case DBCS_KORU: /* ? */
- case DBCS_KOR:
- {
- /* KS code classification */
- unsigned char c1 = lead;
- unsigned char c2 = trail;
-
- /*
- * 20 : Hangul
- * 21 : Hanja
- * 22 : Symbols
- * 23 : Alpha-numeric/Roman Letter (Full width)
- * 24 : Hangul Letter(Alphabet)
- * 25 : Roman Numeral/Greek Letter
- * 26 : Box Drawings
- * 27 : Unit Symbols
- * 28 : Circled/Parenthesized Letter
- * 29 : Hiragana/Katakana
- * 30 : Cyrillic Letter
- */
-
- if (c1 >= 0xB0 && c1 <= 0xC8)
- /* Hangul */
- return 20;
-
- else if (c1 >= 0xCA && c1 <= 0xFD)
- /* Hanja */
- return 21;
- else switch (c1) {
- case 0xA1:
- case 0xA2:
- /* Symbols */
- return 22;
- case 0xA3:
- /* Alpha-numeric */
- return 23;
- case 0xA4:
- /* Hangul Letter(Alphabet) */
- return 24;
- case 0xA5:
- /* Roman Numeral/Greek Letter */
- return 25;
- case 0xA6:
- /* Box Drawings */
- return 26;
- case 0xA7:
- /* Unit Symbols */
- return 27;
- case 0xA8:
- case 0xA9:
- if (c2 <= 0xAF)
- return 25; /* Roman Letter */
- else if (c2 >= 0xF6)
- return 22; /* Symbols */
- else
- /* Circled/Parenthesized Letter */
- return 28;
- case 0xAA:
- case 0xAB:
- /* Hiragana/Katakana */
- return 29;
- case 0xAC:
- /* Cyrillic Letter */
- return 30;
- }
- }
- default:
- break;
- }
- return 3;
-}
-
-/*
- * mb_char2len() function pointer.
- * Return length in bytes of character "c".
- * Returns 1 for a single-byte character.
- */
-int latin_char2len(int c)
-{
- return 1;
-}
-
-static int dbcs_char2len(int c)
-{
- if (c >= 0x100)
- return 2;
- return 1;
-}
-
-/*
- * mb_char2bytes() function pointer.
- * Convert a character to its bytes.
- * Returns the length in bytes.
- */
-int latin_char2bytes(int c, char_u *buf)
-{
- buf[0] = c;
- return 1;
-}
-
-static int dbcs_char2bytes(int c, char_u *buf)
-{
- if (c >= 0x100) {
- buf[0] = (unsigned)c >> 8;
- buf[1] = c;
- /* Never use a NUL byte, it causes lots of trouble. It's an invalid
- * character anyway. */
- if (buf[1] == NUL)
- buf[1] = '\n';
- return 2;
- }
- buf[0] = c;
- return 1;
-}
-
-/*
- * mb_ptr2len() function pointer.
- * Get byte length of character at "*p" but stop at a NUL.
- * For UTF-8 this includes following composing characters.
- * Returns 0 when *p is NUL.
- */
-int latin_ptr2len(const char_u *p)
-{
- return MB_BYTE2LEN(*p);
-}
-
-static int dbcs_ptr2len(const char_u *p)
-{
- int len;
-
- /* Check if second byte is not missing. */
- len = MB_BYTE2LEN(*p);
- if (len == 2 && p[1] == NUL)
- len = 1;
- return len;
-}
-
-/*
- * mb_ptr2len_len() function pointer.
- * Like mb_ptr2len(), but limit to read "size" bytes.
- * Returns 0 for an empty string.
- * Returns 1 for an illegal char or an incomplete byte sequence.
- */
-int latin_ptr2len_len(const char_u *p, int size)
-{
- if (size < 1 || *p == NUL)
- return 0;
- return 1;
-}
-
-static int dbcs_ptr2len_len(const char_u *p, int size)
-{
- int len;
-
- if (size < 1 || *p == NUL)
- return 0;
- if (size == 1)
- return 1;
- /* Check that second byte is not missing. */
- len = MB_BYTE2LEN(*p);
- if (len == 2 && p[1] == NUL)
- len = 1;
- return len;
+ return utf_class_tab(utf_ptr2char(p), chartab);
}
/*
@@ -950,6 +484,9 @@ int utf_char2cells(int c)
if (intable(doublewidth, ARRAY_SIZE(doublewidth), c))
return 2;
#endif
+ if (p_emoji && intable(emoji_width, ARRAY_SIZE(emoji_width), c)) {
+ return 2;
+ }
}
/* Characters below 0x100 are influenced by 'isprint' option */
else if (c >= 0x80 && !vim_isprintc(c))
@@ -961,16 +498,8 @@ int utf_char2cells(int c)
return 1;
}
-/*
- * mb_ptr2cells() function pointer.
- * Return the number of display cells character at "*p" occupies.
- * This doesn't take care of unprintable characters, use ptr2cells() for that.
- */
-int latin_ptr2cells(const char_u *p)
-{
- return 1;
-}
-
+/// Return the number of display cells character at "*p" occupies.
+/// This doesn't take care of unprintable characters, use ptr2cells() for that.
int utf_ptr2cells(const char_u *p)
{
int c;
@@ -989,26 +518,9 @@ int utf_ptr2cells(const char_u *p)
return 1;
}
-int dbcs_ptr2cells(const char_u *p)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && *p == 0x8e)
- return 1;
- return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_ptr2cells_len() function pointer.
- * Like mb_ptr2cells(), but limit string length to "size".
- * For an empty string or truncated character returns 1.
- */
-int latin_ptr2cells_len(const char_u *p, int size)
-{
- return 1;
-}
-
-static int utf_ptr2cells_len(const char_u *p, int size)
+/// Like utf_ptr2cells(), but limit string length to "size".
+/// For an empty string or truncated character returns 1.
+int utf_ptr2cells_len(const char_u *p, int size)
{
int c;
@@ -1028,35 +540,6 @@ static int utf_ptr2cells_len(const char_u *p, int size)
return 1;
}
-static int dbcs_ptr2cells_len(const char_u *p, int size)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (size <= 1 || (enc_dbcs == DBCS_JPNU && *p == 0x8e))
- return 1;
- return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_char2cells() function pointer.
- * Return the number of display cells character "c" occupies.
- * Only takes care of multi-byte chars, not "^C" and such.
- */
-int latin_char2cells(int c)
-{
- return 1;
-}
-
-static int dbcs_char2cells(int c)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && ((unsigned)c >> 8) == 0x8e)
- return 1;
- /* use the first byte */
- return MB_BYTE2LEN((unsigned)c >> 8);
-}
-
/// Calculate the number of cells occupied by string `str`.
///
/// @param str The source string, may not be NULL, must be a NUL-terminated
@@ -1067,95 +550,60 @@ size_t mb_string2cells(const char_u *str)
size_t clen = 0;
for (const char_u *p = str; *p != NUL; p += (*mb_ptr2len)(p)) {
- clen += (*mb_ptr2cells)(p);
+ clen += utf_ptr2cells(p);
}
return clen;
}
-/*
- * mb_off2cells() function pointer.
- * Return number of display cells for char at ScreenLines[off].
- * We make sure that the offset used is less than "max_off".
- */
-int latin_off2cells(unsigned off, unsigned max_off)
-{
- return 1;
-}
-
-int dbcs_off2cells(unsigned off, unsigned max_off)
-{
- /* never check beyond end of the line */
- if (off >= max_off)
- return 1;
-
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && ScreenLines[off] == 0x8e)
- return 1;
- return MB_BYTE2LEN(ScreenLines[off]);
-}
-
-int utf_off2cells(unsigned off, unsigned max_off)
-{
- return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1;
-}
-
-/*
- * mb_ptr2char() function pointer.
- * Convert a byte sequence into a character.
- */
-int latin_ptr2char(const char_u *p)
-{
- return *p;
-}
-
-static int dbcs_ptr2char(const char_u *p)
-{
- if (MB_BYTE2LEN(*p) > 1 && p[1] != NUL)
- return (p[0] << 8) + p[1];
- return *p;
-}
-
-/*
- * Convert a UTF-8 byte sequence to a wide character.
- * If the sequence is illegal or truncated by a NUL the first byte is
- * returned.
- * Does not include composing characters, of course.
- */
-int utf_ptr2char(const char_u *p)
+/// Convert a UTF-8 byte sequence to a wide character
+///
+/// If the sequence is illegal or truncated by a NUL then the first byte is
+/// returned.
+/// For an overlong sequence this may return zero.
+/// Does not include composing characters for obvious reasons.
+///
+/// @param[in] p String to convert.
+///
+/// @return Unicode codepoint or byte value.
+int utf_ptr2char(const char_u *const p)
+ FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT
{
- uint8_t len;
-
- if (p[0] < 0x80) /* be quick for ASCII */
+ if (p[0] < 0x80) { // Be quick for ASCII.
return p[0];
+ }
- len = utf8len_tab_zero[p[0]];
+ const uint8_t len = utf8len_tab_zero[p[0]];
if (len > 1 && (p[1] & 0xc0) == 0x80) {
- if (len == 2)
+ if (len == 2) {
return ((p[0] & 0x1f) << 6) + (p[1] & 0x3f);
+ }
if ((p[2] & 0xc0) == 0x80) {
- if (len == 3)
- return ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
- + (p[2] & 0x3f);
+ if (len == 3) {
+ return (((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6)
+ + (p[2] & 0x3f));
+ }
if ((p[3] & 0xc0) == 0x80) {
- if (len == 4)
- return ((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
- + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f);
+ if (len == 4) {
+ return (((p[0] & 0x07) << 18) + ((p[1] & 0x3f) << 12)
+ + ((p[2] & 0x3f) << 6) + (p[3] & 0x3f));
+ }
if ((p[4] & 0xc0) == 0x80) {
- if (len == 5)
- return ((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
- + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
- + (p[4] & 0x3f);
- if ((p[5] & 0xc0) == 0x80 && len == 6)
- return ((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
- + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
- + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f);
+ if (len == 5) {
+ return (((p[0] & 0x03) << 24) + ((p[1] & 0x3f) << 18)
+ + ((p[2] & 0x3f) << 12) + ((p[3] & 0x3f) << 6)
+ + (p[4] & 0x3f));
+ }
+ if ((p[5] & 0xc0) == 0x80 && len == 6) {
+ return (((p[0] & 0x01) << 30) + ((p[1] & 0x3f) << 24)
+ + ((p[2] & 0x3f) << 18) + ((p[3] & 0x3f) << 12)
+ + ((p[4] & 0x3f) << 6) + (p[5] & 0x3f));
+ }
}
}
}
}
- /* Illegal value, just return the first byte */
+ // Illegal value: just return the first byte.
return p[0];
}
@@ -1175,7 +623,7 @@ int utf_ptr2char(const char_u *p)
* If byte sequence is illegal or incomplete, returns -1 and does not advance
* "s".
*/
-static int utf_safe_read_char_adv(char_u **s, size_t *n)
+static int utf_safe_read_char_adv(const char_u **s, size_t *n)
{
int c;
@@ -1217,11 +665,11 @@ static int utf_safe_read_char_adv(char_u **s, size_t *n)
* Get character at **pp and advance *pp to the next character.
* Note: composing characters are skipped!
*/
-int mb_ptr2char_adv(char_u **pp)
+int mb_ptr2char_adv(const char_u **const pp)
{
int c;
- c = (*mb_ptr2char)(*pp);
+ c = utf_ptr2char(*pp);
*pp += (*mb_ptr2len)(*pp);
return c;
}
@@ -1230,15 +678,12 @@ int mb_ptr2char_adv(char_u **pp)
* Get character at **pp and advance *pp to the next character.
* Note: composing characters are returned as separate characters.
*/
-int mb_cptr2char_adv(char_u **pp)
+int mb_cptr2char_adv(const char_u **pp)
{
int c;
- c = (*mb_ptr2char)(*pp);
- if (enc_utf8)
- *pp += utf_ptr2len(*pp);
- else
- *pp += (*mb_ptr2len)(*pp);
+ c = utf_ptr2char(*pp);
+ *pp += utf_ptr2len(*pp);
return c;
}
@@ -1259,12 +704,14 @@ bool utf_composinglike(const char_u *p1, const char_u *p2)
return arabic_combine(utf_ptr2char(p1), c2);
}
-/*
- * Convert a UTF-8 byte string to a wide character. Also get up to MAX_MCO
- * composing characters.
- *
- * @param [out] pcc: composing chars, last one is 0
- */
+/// Convert a UTF-8 string to a wide character
+///
+/// Also gets up to #MAX_MCO composing characters.
+///
+/// @param[out] pcc Location where to store composing characters. Must have
+/// space at least for #MAX_MCO + 1 elements.
+///
+/// @return leading character.
int utfc_ptr2char(const char_u *p, int *pcc)
{
int len;
@@ -1338,44 +785,24 @@ int utfc_ptr2char_len(const char_u *p, int *pcc, int maxlen)
#undef ISCOMPOSING
}
-/*
- * Convert the character at screen position "off" to a sequence of bytes.
- * Includes the composing characters.
- * "buf" must at least have the length MB_MAXBYTES + 1.
- * Only to be used when ScreenLinesUC[off] != 0.
- * Returns the produced number of bytes.
- */
-int utfc_char2bytes(int off, char_u *buf)
-{
- int len;
- int i;
-
- len = utf_char2bytes(ScreenLinesUC[off], buf);
- for (i = 0; i < Screen_mco; ++i) {
- if (ScreenLinesC[i][off] == 0)
- break;
- len += utf_char2bytes(ScreenLinesC[i][off], buf + len);
- }
- return len;
-}
-
-/*
- * Get the length of a UTF-8 byte sequence, not including any following
- * composing characters.
- * Returns 0 for "".
- * Returns 1 for an illegal byte sequence.
- */
-int utf_ptr2len(const char_u *p)
+/// Get the length of a UTF-8 byte sequence representing a single codepoint
+///
+/// @param[in] p UTF-8 string.
+///
+/// @return Sequence length, 0 for empty string and 1 for non-UTF-8 byte
+/// sequence.
+int utf_ptr2len(const char_u *const p)
+ FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
- int len;
- int i;
-
- if (*p == NUL)
+ if (*p == NUL) {
return 0;
- len = utf8len_tab[*p];
- for (i = 1; i < len; ++i)
- if ((p[i] & 0xc0) != 0x80)
+ }
+ const int len = utf8len_tab[*p];
+ for (int i = 1; i < len; i++) {
+ if ((p[i] & 0xc0) != 0x80) {
return 1;
+ }
+ }
return len;
}
@@ -1416,38 +843,38 @@ int utf_ptr2len_len(const char_u *p, int size)
return len;
}
-/*
- * Return the number of bytes the UTF-8 encoding of the character at "p" takes.
- * This includes following composing characters.
- */
-int utfc_ptr2len(const char_u *p)
+/// Return the number of bytes occupied by a UTF-8 character in a string
+///
+/// This includes following composing characters.
+int utfc_ptr2len(const char_u *const p)
+ FUNC_ATTR_PURE FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
{
- int len;
- int b0 = *p;
- int prevlen;
+ uint8_t b0 = (uint8_t)(*p);
- if (b0 == NUL)
+ if (b0 == NUL) {
return 0;
- if (b0 < 0x80 && p[1] < 0x80) /* be quick for ASCII */
+ }
+ if (b0 < 0x80 && p[1] < 0x80) { // be quick for ASCII
return 1;
+ }
- /* Skip over first UTF-8 char, stopping at a NUL byte. */
- len = utf_ptr2len(p);
+ // Skip over first UTF-8 char, stopping at a NUL byte.
+ int len = utf_ptr2len(p);
- /* Check for illegal byte. */
- if (len == 1 && b0 >= 0x80)
+ // Check for illegal byte.
+ if (len == 1 && b0 >= 0x80) {
return 1;
+ }
- /*
- * Check for composing characters. We can handle only the first six, but
- * skip all of them (otherwise the cursor would get stuck).
- */
- prevlen = 0;
- for (;; ) {
- if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len))
+ // Check for composing characters. We can handle only the first six, but
+ // skip all of them (otherwise the cursor would get stuck).
+ int prevlen = 0;
+ for (;;) {
+ if (p[len] < 0x80 || !UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
return len;
+ }
- /* Skip over composing char */
+ // Skip over composing char.
prevlen = len;
len += utf_ptr2len(p + len);
}
@@ -1505,70 +932,65 @@ int utfc_ptr2len_len(const char_u *p, int size)
return len;
}
-/*
- * Return the number of bytes the UTF-8 encoding of character "c" takes.
- * This does not include composing characters.
- */
-int utf_char2len(int c)
+/// Determine how many bytes certain unicode codepoint will occupy
+int utf_char2len(const int c)
{
- if (c < 0x80)
+ if (c < 0x80) {
return 1;
- if (c < 0x800)
+ } else if (c < 0x800) {
return 2;
- if (c < 0x10000)
+ } else if (c < 0x10000) {
return 3;
- if (c < 0x200000)
+ } else if (c < 0x200000) {
return 4;
- if (c < 0x4000000)
+ } else if (c < 0x4000000) {
return 5;
- return 6;
+ } else {
+ return 6;
+ }
}
-/*
- * Convert Unicode character "c" to UTF-8 string in "buf[]".
- * Returns the number of bytes.
- * This does not include composing characters.
- */
-int utf_char2bytes(int c, char_u *buf)
+/// Convert Unicode character to UTF-8 string
+///
+/// @param c character to convert to \p buf
+/// @param[out] buf UTF-8 string generated from \p c, does not add \0
+/// @return Number of bytes (1-6). Does not include composing characters.
+int utf_char2bytes(const int c, char_u *const buf)
{
- if (c < 0x80) { /* 7 bits */
+ if (c < 0x80) { // 7 bits
buf[0] = c;
return 1;
- }
- if (c < 0x800) { /* 11 bits */
+ } else if (c < 0x800) { // 11 bits
buf[0] = 0xc0 + ((unsigned)c >> 6);
buf[1] = 0x80 + (c & 0x3f);
return 2;
- }
- if (c < 0x10000) { /* 16 bits */
+ } else if (c < 0x10000) { // 16 bits
buf[0] = 0xe0 + ((unsigned)c >> 12);
buf[1] = 0x80 + (((unsigned)c >> 6) & 0x3f);
buf[2] = 0x80 + (c & 0x3f);
return 3;
- }
- if (c < 0x200000) { /* 21 bits */
+ } else if (c < 0x200000) { // 21 bits
buf[0] = 0xf0 + ((unsigned)c >> 18);
buf[1] = 0x80 + (((unsigned)c >> 12) & 0x3f);
buf[2] = 0x80 + (((unsigned)c >> 6) & 0x3f);
buf[3] = 0x80 + (c & 0x3f);
return 4;
- }
- if (c < 0x4000000) { /* 26 bits */
+ } else if (c < 0x4000000) { // 26 bits
buf[0] = 0xf8 + ((unsigned)c >> 24);
buf[1] = 0x80 + (((unsigned)c >> 18) & 0x3f);
buf[2] = 0x80 + (((unsigned)c >> 12) & 0x3f);
buf[3] = 0x80 + (((unsigned)c >> 6) & 0x3f);
buf[4] = 0x80 + (c & 0x3f);
return 5;
+ } else { // 31 bits
+ buf[0] = 0xfc + ((unsigned)c >> 30);
+ buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f);
+ buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f);
+ buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f);
+ buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f);
+ buf[5] = 0x80 + (c & 0x3f);
+ return 6;
}
- /* 31 bits */
- buf[0] = 0xfc + ((unsigned)c >> 30);
- buf[1] = 0x80 + (((unsigned)c >> 24) & 0x3f);
- buf[2] = 0x80 + (((unsigned)c >> 18) & 0x3f);
- buf[3] = 0x80 + (((unsigned)c >> 12) & 0x3f);
- buf[4] = 0x80 + (((unsigned)c >> 6) & 0x3f);
- buf[5] = 0x80 + (c & 0x3f);
- return 6;
}
/*
@@ -1612,7 +1034,12 @@ bool utf_printable(int c)
* 1: punctuation
* 2 or bigger: some class of word character.
*/
-int utf_class(int c)
+int utf_class(const int c)
+{
+ return utf_class_tab(c, curbuf->b_chartab);
+}
+
+int utf_class_tab(const int c, const uint64_t *const chartab)
{
/* sorted list of non-overlapping intervals */
static struct clinterval {
@@ -1695,11 +1122,13 @@ int utf_class(int c)
/* First quick check for Latin1 characters, use 'iskeyword'. */
if (c < 0x100) {
- if (c == ' ' || c == '\t' || c == NUL || c == 0xa0)
- return 0; /* blank */
- if (vim_iswordc(c))
- return 2; /* word character */
- return 1; /* punctuation */
+ if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) {
+ return 0; // blank
+ }
+ if (vim_iswordc_tab(c, chartab)) {
+ return 2; // word character
+ }
+ return 1; // punctuation
}
/* binary search in table */
@@ -1713,16 +1142,20 @@ int utf_class(int c)
return (int)classes[mid].class;
}
+ // emoji
+ if (intable(emoji_all, ARRAY_SIZE(emoji_all), c)) {
+ return 3;
+ }
+
/* most other characters are "word" characters */
return 2;
}
-/*
- * Code for Unicode case-dependent operations. Based on notes in
- * http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
- * This code uses simple case folding, not full case folding.
- * Last updated for Unicode 5.2.
- */
+bool utf_ambiguous_width(int c)
+{
+ return c >= 0x80 && (intable(ambiguous, ARRAY_SIZE(ambiguous), c)
+ || intable(emoji_all, ARRAY_SIZE(emoji_all), c));
+}
/*
* Generic conversion function for case operations.
@@ -1758,14 +1191,21 @@ static int utf_convert(int a, const convertStruct *const table, size_t n_items)
*/
int utf_fold(int a)
{
+ if (a < 0x80) {
+ // be fast for ASCII
+ return a >= 0x41 && a <= 0x5a ? a + 32 : a;
+ }
return utf_convert(a, foldCase, ARRAY_SIZE(foldCase));
}
-/*
- * Return the upper-case equivalent of "a", which is a UCS-4 character. Use
- * simple case folding.
- */
-int utf_toupper(int a)
+// Vim's own character class functions. These exist because many library
+// islower()/toupper() etc. do not work properly: they crash when used with
+// invalid values or can't handle latin1 when the locale is C.
+// Speed is most important here.
+
+/// Return the upper-case equivalent of "a", which is a UCS-4 character. Use
+/// simple case folding.
+int mb_toupper(int a)
{
/* If 'casemap' contains "keepascii" use ASCII style toupper(). */
if (a < 128 && (cmp_flags & CMP_KEEPASCII))
@@ -1785,17 +1225,15 @@ int utf_toupper(int a)
return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
}
-bool utf_islower(int a)
+bool mb_islower(int a)
{
- /* German sharp s is lower case but has no upper case equivalent. */
- return (utf_toupper(a) != a) || a == 0xdf;
+ // German sharp s is lower case but has no upper case equivalent.
+ return (mb_toupper(a) != a) || a == 0xdf;
}
-/*
- * Return the lower-case equivalent of "a", which is a UCS-4 character. Use
- * simple case folding.
- */
-int utf_tolower(int a)
+/// Return the lower-case equivalent of "a", which is a UCS-4 character. Use
+/// simple case folding.
+int mb_tolower(int a)
{
/* If 'casemap' contains "keepascii" use ASCII style tolower(). */
if (a < 128 && (cmp_flags & CMP_KEEPASCII))
@@ -1815,12 +1253,13 @@ int utf_tolower(int a)
return utf_convert(a, toLower, ARRAY_SIZE(toLower));
}
-bool utf_isupper(int a)
+bool mb_isupper(int a)
{
- return utf_tolower(a) != a;
+ return mb_tolower(a) != a;
}
-static int utf_strnicmp(char_u *s1, char_u *s2, size_t n1, size_t n2)
+static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
+ size_t n2)
{
int c1, c2, cdiff;
char_u buffer[6];
@@ -1885,6 +1324,93 @@ static int utf_strnicmp(char_u *s1, char_u *s2, size_t n1, size_t n2)
return n1 == 0 ? -1 : 1;
}
+#ifdef WIN32
+#ifndef CP_UTF8
+# define CP_UTF8 65001 /* magic number from winnls.h */
+#endif
+
+/// Reassigns `strw` to a new, allocated pointer to a UTF16 string.
+int utf8_to_utf16(const char *str, wchar_t **strw)
+ FUNC_ATTR_NONNULL_ALL
+{
+ ssize_t wchar_len = 0;
+
+ // Compute the length needed to store the converted widechar string.
+ wchar_len = MultiByteToWideChar(CP_UTF8,
+ 0, // dwFlags: must be 0 for utf8
+ str, // lpMultiByteStr: string to convert
+ -1, // -1 => process up to NUL
+ NULL, // lpWideCharStr: converted string
+ 0); // 0 => return length, don't convert
+ if (wchar_len == 0) {
+ return GetLastError();
+ }
+
+ ssize_t buf_sz = wchar_len * sizeof(wchar_t);
+
+ if (buf_sz == 0) {
+ *strw = NULL;
+ return 0;
+ }
+
+ char *buf = xmalloc(buf_sz);
+ char *pos = buf;
+
+ int r = MultiByteToWideChar(CP_UTF8,
+ 0,
+ str,
+ -1,
+ (wchar_t *)pos,
+ wchar_len);
+ assert(r == wchar_len);
+ if (r != wchar_len) {
+ EMSG2("MultiByteToWideChar failed: %d", r);
+ }
+ *strw = (wchar_t *)pos;
+
+ return 0;
+}
+
+/// Reassigns `str` to a new, allocated pointer to a UTF8 string.
+int utf16_to_utf8(const wchar_t *strw, char **str)
+ FUNC_ATTR_NONNULL_ALL
+{
+ // Compute the space required to store the string as UTF-8.
+ DWORD utf8_len = WideCharToMultiByte(CP_UTF8,
+ 0,
+ strw,
+ -1,
+ NULL,
+ 0,
+ NULL,
+ NULL);
+ if (utf8_len == 0) {
+ return GetLastError();
+ }
+
+ *str = xmallocz(utf8_len);
+
+ // Convert to UTF-8.
+ utf8_len = WideCharToMultiByte(CP_UTF8,
+ 0,
+ strw,
+ -1,
+ *str,
+ utf8_len,
+ NULL,
+ NULL);
+ if (utf8_len == 0) {
+ free(*str);
+ *str = NULL;
+ return GetLastError();
+ }
+ (*str)[utf8_len] = '\0';
+
+ return 0;
+}
+
+#endif
+
/*
* Version of strnicmp() that handles multi-byte characters.
* Needed for Big5, Shift-JIS and UTF-8 encoding. Other DBCS encodings can
@@ -1893,48 +1419,26 @@ static int utf_strnicmp(char_u *s1, char_u *s2, size_t n1, size_t n2)
* Returns zero if s1 and s2 are equal (ignoring case), the difference between
* two characters otherwise.
*/
-int mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
+int mb_strnicmp(const char_u *s1, const char_u *s2, const size_t nn)
{
- int i, l;
- int cdiff;
- int n = (int)nn;
-
- if (enc_utf8) {
- return utf_strnicmp(s1, s2, nn, nn);
- } else {
- for (i = 0; i < n; i += l) {
- if (s1[i] == NUL && s2[i] == NUL) /* both strings end */
- return 0;
-
- l = (*mb_ptr2len)(s1 + i);
- if (l <= 1) {
- /* Single byte: first check normally, then with ignore case. */
- if (s1[i] != s2[i]) {
- cdiff = vim_tolower(s1[i]) - vim_tolower(s2[i]);
- if (cdiff != 0)
- return cdiff;
- }
- } else {
- /* For non-Unicode multi-byte don't ignore case. */
- if (l > n - i)
- l = n - i;
- cdiff = STRNCMP(s1 + i, s2 + i, l);
- if (cdiff != 0)
- return cdiff;
- }
- }
- }
- return 0;
+ return utf_strnicmp(s1, s2, nn, nn);
}
-/* We need to call mb_stricmp() even when we aren't dealing with a multi-byte
- * encoding because mb_stricmp() takes care of all ascii and non-ascii
- * encodings, including characters with umlauts in latin1, etc., while
- * STRICMP() only handles the system locale version, which often does not
- * handle non-ascii properly. */
-int mb_stricmp(char_u *s1, char_u *s2)
+/// Compare strings case-insensitively
+///
+/// @note We need to call mb_stricmp() even when we aren't dealing with
+/// a multi-byte encoding because mb_stricmp() takes care of all ASCII and
+/// non-ascii encodings, including characters with umlauts in latin1,
+/// etc., while STRICMP() only handles the system locale version, which
+/// often does not handle non-ascii properly.
+///
+/// @param[in] s1 First string to compare, not more then #MAXCOL characters.
+/// @param[in] s2 Second string to compare, not more then #MAXCOL characters.
+///
+/// @return 0 if strings are equal, <0 if s1 < s2, >0 if s1 > s2.
+int mb_stricmp(const char *s1, const char *s2)
{
- return mb_strnicmp(s1, s2, MAXCOL);
+ return mb_strnicmp((const char_u *)s1, (const char_u *)s2, MAXCOL);
}
/*
@@ -1979,68 +1483,9 @@ void show_utf8(void)
msg(IObuff);
}
-/*
- * mb_head_off() function pointer.
- * Return offset from "p" to the first byte of the character it points into.
- * If "p" points to the NUL at the end of the string return 0.
- * Returns 0 when already at the first byte of a character.
- */
-int latin_head_off(const char_u *base, const char_u *p)
-{
- return 0;
-}
-
-int dbcs_head_off(const char_u *base, const char_u *p)
-{
- /* It can't be a trailing byte when not using DBCS, at the start of the
- * string or the previous byte can't start a double-byte. */
- if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) {
- return 0;
- }
-
- /* This is slow: need to start at the base and go forward until the
- * byte we are looking for. Return 1 when we went past it, 0 otherwise. */
- const char_u *q = base;
- while (q < p) {
- q += dbcs_ptr2len(q);
- }
-
- return (q == p) ? 0 : 1;
-}
-
-/*
- * Special version of dbcs_head_off() that works for ScreenLines[], where
- * single-width DBCS_JPNU characters are stored separately.
- */
-int dbcs_screen_head_off(const char_u *base, const char_u *p)
-{
- /* It can't be a trailing byte when not using DBCS, at the start of the
- * string or the previous byte can't start a double-byte.
- * For euc-jp an 0x8e byte in the previous cell always means we have a
- * lead byte in the current cell. */
- if (p <= base
- || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e)
- || MB_BYTE2LEN(p[-1]) == 1
- || *p == NUL)
- return 0;
-
- /* This is slow: need to start at the base and go forward until the
- * byte we are looking for. Return 1 when we went past it, 0 otherwise.
- * For DBCS_JPNU look out for 0x8e, which means the second byte is not
- * stored as the next byte. */
- const char_u *q = base;
- while (q < p) {
- if (enc_dbcs == DBCS_JPNU && *q == 0x8e) {
- ++q;
- }
- else {
- q += dbcs_ptr2len(q);
- }
- }
-
- return (q == p) ? 0 : 1;
-}
-
+/// Return offset from "p" to the first byte of the character it points into.
+/// If "p" points to the NUL at the end of the string return 0.
+/// Returns 0 when already at the first byte of a character.
int utf_head_off(const char_u *base, const char_u *p)
{
int c;
@@ -2089,14 +1534,15 @@ int utf_head_off(const char_u *base, const char_u *p)
return (int)(p - q);
}
-/*
- * Copy a character from "*fp" to "*tp" and advance the pointers.
- */
-void mb_copy_char(const char_u **fp, char_u **tp)
+/// Copy a character, advancing the pointers
+///
+/// @param[in,out] fp Source of the character to copy.
+/// @param[in,out] tp Destination to copy to.
+void mb_copy_char(const char_u **const fp, char_u **const tp)
{
- int l = (*mb_ptr2len)(*fp);
+ const size_t l = (size_t)utfc_ptr2len(*fp);
- memmove(*tp, *fp, (size_t)l);
+ memmove(*tp, *fp, l);
*tp += l;
*fp += l;
}
@@ -2111,27 +1557,24 @@ int mb_off_next(char_u *base, char_u *p)
int i;
int j;
- if (enc_utf8) {
- if (*p < 0x80) /* be quick for ASCII */
- return 0;
+ if (*p < 0x80) { // be quick for ASCII
+ return 0;
+ }
- /* Find the next character that isn't 10xx.xxxx */
- for (i = 0; (p[i] & 0xc0) == 0x80; ++i)
- ;
- if (i > 0) {
- /* Check for illegal sequence. */
- for (j = 0; p - j > base; ++j)
- if ((p[-j] & 0xc0) != 0x80)
- break;
- if (utf8len_tab[p[-j]] != i + j)
- return 0;
+ // Find the next character that isn't 10xx.xxxx
+ for (i = 0; (p[i] & 0xc0) == 0x80; i++) {}
+ if (i > 0) {
+ // Check for illegal sequence.
+ for (j = 0; p - j > base; j++) {
+ if ((p[-j] & 0xc0) != 0x80) {
+ break;
+ }
+ }
+ if (utf8len_tab[p[-j]] != i + j) {
+ return 0;
}
- return i;
}
-
- /* Only need to check if we're on a trail byte, it doesn't matter if we
- * want the offset to the next or current character. */
- return (*mb_head_off)(base, p);
+ return i;
}
/*
@@ -2146,26 +1589,20 @@ int mb_tail_off(char_u *base, char_u *p)
if (*p == NUL)
return 0;
- if (enc_utf8) {
- /* Find the last character that is 10xx.xxxx */
- for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i)
- ;
- /* Check for illegal sequence. */
- for (j = 0; p - j > base; ++j)
- if ((p[-j] & 0xc0) != 0x80)
- break;
- if (utf8len_tab[p[-j]] != i + j + 1)
- return 0;
- return i;
+ // Find the last character that is 10xx.xxxx
+ for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
+
+ // Check for illegal sequence.
+ for (j = 0; p - j > base; j++) {
+ if ((p[-j] & 0xc0) != 0x80) {
+ break;
+ }
}
- /* It can't be the first byte if a double-byte when not using DBCS, at the
- * end of the string or the byte can't start a double-byte. */
- if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1)
+ if (utf8len_tab[p[-j]] != i + j + 1) {
return 0;
-
- /* Return 1 when on the lead byte, 0 when on the tail byte. */
- return 1 - dbcs_head_off(base, p);
+ }
+ return i;
}
/*
@@ -2180,10 +1617,10 @@ void utf_find_illegal(void)
char_u *tofree = NULL;
vimconv.vc_type = CONV_NONE;
- if (enc_utf8 && (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT)) {
- /* 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
- * possibly a utf-8 file with illegal bytes. Setup for conversion
- * from utf-8 to 'fileencoding'. */
+ if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
+ // 'encoding' is "utf-8" but we are editing a 8-bit encoded file,
+ // possibly a utf-8 file with illegal bytes. Setup for conversion
+ // from utf-8 to 'fileencoding'.
convert_setup(&vimconv, p_enc, curbuf->b_p_fenc);
}
@@ -2240,29 +1677,42 @@ theend:
*/
void mb_adjust_cursor(void)
{
- mb_adjustpos(curbuf, &curwin->w_cursor);
+ mark_mb_adjustpos(curbuf, &curwin->w_cursor);
}
-/*
- * Adjust position "*lp" to point to the first byte of a multi-byte character.
- * If it points to a tail byte it's moved backwards to the head byte.
- */
-void mb_adjustpos(buf_T *buf, pos_T *lp)
+/// Checks and adjusts cursor column. Not mode-dependent.
+/// @see check_cursor_col_win
+///
+/// @param win_ Places cursor on a valid column for this window.
+void mb_check_adjust_col(void *win_)
{
- char_u *p;
+ win_T *win = (win_T *)win_;
+ colnr_T oldcol = win->w_cursor.col;
- if (lp->col > 0
- || lp->coladd > 1
- ) {
- p = ml_get_buf(buf, lp->lnum, FALSE);
- lp->col -= (*mb_head_off)(p, p + lp->col);
- /* Reset "coladd" when the cursor would be on the right half of a
- * double-wide character. */
- if (lp->coladd == 1
- && p[lp->col] != TAB
- && vim_isprintc((*mb_ptr2char)(p + lp->col))
- && ptr2cells(p + lp->col) > 1)
- lp->coladd = 0;
+ // Column 0 is always valid.
+ if (oldcol != 0) {
+ char_u *p = ml_get_buf(win->w_buffer, win->w_cursor.lnum, false);
+ colnr_T len = (colnr_T)STRLEN(p);
+
+ // Empty line or invalid column?
+ if (len == 0 || oldcol < 0) {
+ win->w_cursor.col = 0;
+ } else {
+ // Cursor column too big for line?
+ if (oldcol > len) {
+ win->w_cursor.col = len - 1;
+ }
+ // Move the cursor to the head byte.
+ win->w_cursor.col -= utf_head_off(p, p + win->w_cursor.col);
+ }
+
+ // Reset `coladd` when the cursor would be on the right half of a
+ // double-wide character.
+ if (win->w_cursor.coladd == 1 && p[win->w_cursor.col] != TAB
+ && vim_isprintc(utf_ptr2char(p + win->w_cursor.col))
+ && ptr2cells(p + win->w_cursor.col) > 1) {
+ win->w_cursor.coladd = 0;
+ }
}
}
@@ -2274,8 +1724,9 @@ char_u * mb_prevptr(
char_u *p
)
{
- if (p > line)
- mb_ptr_back(line, p);
+ if (p > line) {
+ MB_PTR_BACK(line, p);
+ }
return p;
}
@@ -2311,85 +1762,59 @@ int mb_charlen_len(char_u *str, int len)
return count;
}
-/*
- * Try to un-escape a multi-byte character.
- * Used for the "to" and "from" part of a mapping.
- * Return the un-escaped string if it is a multi-byte character, and advance
- * "pp" to just after the bytes that formed it.
- * Return NULL if no multi-byte char was found.
- */
-char_u * mb_unescape(char_u **pp)
-{
- static char_u buf[6];
- int n;
- int m = 0;
- char_u *str = *pp;
-
- /* Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
- * KS_EXTRA KE_CSI to CSI.
- * Maximum length of a utf-8 character is 4 bytes. */
- for (n = 0; str[n] != NUL && m < 4; ++n) {
- if (str[n] == K_SPECIAL
- && str[n + 1] == KS_SPECIAL
- && str[n + 2] == KE_FILLER) {
- buf[m++] = K_SPECIAL;
- n += 2;
- } else if ((str[n] == K_SPECIAL
- )
- && str[n + 1] == KS_EXTRA
- && str[n + 2] == (int)KE_CSI) {
- buf[m++] = CSI;
- n += 2;
- } else if (str[n] == K_SPECIAL
- )
- break; /* a special key can't be a multibyte char */
- else
- buf[m++] = str[n];
- buf[m] = NUL;
+/// Try to unescape a multibyte character
+///
+/// Used for the rhs and lhs of the mappings.
+///
+/// @param[in,out] pp String to unescape. Is advanced to just after the bytes
+/// that form a multibyte character.
+///
+/// @return Unescaped string if it is a multibyte character, NULL if no
+/// multibyte character was found. Returns a static buffer, always one
+/// and the same.
+const char *mb_unescape(const char **const pp)
+ FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_NONNULL_ALL
+{
+ static char buf[6];
+ size_t buf_idx = 0;
+ uint8_t *str = (uint8_t *)(*pp);
+
+ // Must translate K_SPECIAL KS_SPECIAL KE_FILLER to K_SPECIAL and CSI
+ // KS_EXTRA KE_CSI to CSI.
+ // Maximum length of a utf-8 character is 4 bytes.
+ for (size_t str_idx = 0; str[str_idx] != NUL && buf_idx < 4; str_idx++) {
+ if (str[str_idx] == K_SPECIAL
+ && str[str_idx + 1] == KS_SPECIAL
+ && str[str_idx + 2] == KE_FILLER) {
+ buf[buf_idx++] = (char)K_SPECIAL;
+ str_idx += 2;
+ } else if ((str[str_idx] == K_SPECIAL)
+ && str[str_idx + 1] == KS_EXTRA
+ && str[str_idx + 2] == KE_CSI) {
+ buf[buf_idx++] = (char)CSI;
+ str_idx += 2;
+ } else if (str[str_idx] == K_SPECIAL) {
+ break; // A special key can't be a multibyte char.
+ } else {
+ buf[buf_idx++] = (char)str[str_idx];
+ }
+ buf[buf_idx] = NUL;
- /* Return a multi-byte character if it's found. An illegal sequence
- * will result in a 1 here. */
- if ((*mb_ptr2len)(buf) > 1) {
- *pp = str + n + 1;
+ // Return a multi-byte character if it's found. An illegal sequence
+ // will result in a 1 here.
+ if (utf_ptr2len((const char_u *)buf) > 1) {
+ *pp = (const char *)str + str_idx + 1;
return buf;
}
- /* Bail out quickly for ASCII. */
- if (buf[0] < 128)
+ // Bail out quickly for ASCII.
+ if ((uint8_t)buf[0] < 128) {
break;
+ }
}
return NULL;
}
-/*
- * Return true if the character at "row"/"col" on the screen is the left side
- * of a double-width character.
- * Caller must make sure "row" and "col" are not invalid!
- */
-bool mb_lefthalve(int row, int col)
-{
- return (*mb_off2cells)(LineOffset[row] + col,
- LineOffset[row] + screen_Columns) > 1;
-}
-
-/*
- * Correct a position on the screen, if it's the right half of a double-wide
- * char move it to the left half. Returns the corrected column.
- */
-int mb_fix_col(int col, int row)
-{
- col = check_col(col);
- row = check_row(row);
- if (has_mbyte && ScreenLines != NULL && col > 0
- && ((enc_dbcs
- && ScreenLines[LineOffset[row] + col] != NUL
- && dbcs_screen_head_off(ScreenLines + LineOffset[row],
- ScreenLines + LineOffset[row] + col))
- || (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0)))
- return col - 1;
- return col;
-}
-
/*
* Skip the Vim specific head of a 'encoding' name.
@@ -2515,36 +1940,39 @@ char_u * enc_locale(void)
return NULL;
}
- /* The most generic locale format is:
- * language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
- * If there is a '.' remove the part before it.
- * if there is something after the codeset, remove it.
- * Make the name lowercase and replace '_' with '-'.
- * Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
- * "ko_KR.EUC" == "euc-kr"
- */
+ // The most generic locale format is:
+ // language[_territory][.codeset][@modifier][+special][,[sponsor][_revision]]
+ // If there is a '.' remove the part before it.
+ // if there is something after the codeset, remove it.
+ // Make the name lowercase and replace '_' with '-'.
+ // Exception: "ja_JP.EUC" == "euc-jp", "zh_CN.EUC" = "euc-cn",
+ // "ko_KR.EUC" == "euc-kr"
const char *p = (char *)vim_strchr((char_u *)s, '.');
if (p != NULL) {
if (p > s + 2 && !STRNICMP(p + 1, "EUC", 3)
&& !isalnum((int)p[4]) && p[4] != '-' && p[-3] == '_') {
- /* copy "XY.EUC" to "euc-XY" to buf[10] */
- strcpy(buf + 10, "euc-");
- buf[14] = p[-2];
- buf[15] = p[-1];
- buf[16] = 0;
- s = buf + 10;
- } else
+ // Copy "XY.EUC" to "euc-XY" to buf[10].
+ memmove(buf, "euc-", 4);
+ buf[4] = (ASCII_ISALNUM(p[-2]) ? TOLOWER_ASC(p[-2]) : 0);
+ buf[5] = (ASCII_ISALNUM(p[-1]) ? TOLOWER_ASC(p[-1]) : 0);
+ buf[6] = NUL;
+ } else {
s = p + 1;
+ goto enc_locale_copy_enc;
+ }
+ } else {
+enc_locale_copy_enc:
+ for (i = 0; i < (int)sizeof(buf) - 1 && s[i] != NUL; i++) {
+ if (s[i] == '_' || s[i] == '-') {
+ buf[i] = '-';
+ } else if (ASCII_ISALNUM((uint8_t)s[i])) {
+ buf[i] = TOLOWER_ASC(s[i]);
+ } else {
+ break;
+ }
+ }
+ buf[i] = NUL;
}
- for (i = 0; s[i] != NUL && i < (int)sizeof(buf) - 1; ++i) {
- if (s[i] == '_' || s[i] == '-')
- buf[i] = '-';
- else if (isalnum((int)s[i]))
- buf[i] = TOLOWER_ASC(s[i]);
- else
- break;
- }
- buf[i] = NUL;
return enc_canonize((char_u *)buf);
}
@@ -2571,9 +1999,10 @@ void * my_iconv_open(char_u *to, char_u *from)
return (void *)-1; /* detected a broken iconv() previously */
#ifdef DYNAMIC_ICONV
- /* Check if the iconv.dll can be found. */
- if (!iconv_enabled(true))
+ // Check if the iconv.dll can be found.
+ if (!iconv_enabled(true)) {
return (void *)-1;
+ }
#endif
fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from));
@@ -2607,8 +2036,8 @@ void * my_iconv_open(char_u *to, char_u *from)
* Returns the converted string in allocated memory. NULL for an error.
* If resultlenp is not NULL, sets it to the result length in bytes.
*/
-static char_u * iconv_string(vimconv_T *vcp, char_u *str, size_t slen,
- size_t *unconvlenp, size_t *resultlenp)
+static char_u *iconv_string(const vimconv_T *const vcp, char_u *str,
+ size_t slen, size_t *unconvlenp, size_t *resultlenp)
{
const char *from;
size_t fromlen;
@@ -2662,15 +2091,10 @@ static char_u * iconv_string(vimconv_T *vcp, char_u *str, size_t slen,
* conversion from 'encoding' to something else. In other
* situations we don't know what to skip anyway. */
*to++ = '?';
- if ((*mb_ptr2cells)((char_u *)from) > 1)
+ if (utf_ptr2cells((char_u *)from) > 1) {
*to++ = '?';
- if (enc_utf8)
- l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
- else {
- l = (*mb_ptr2len)((char_u *)from);
- if (l > (int)fromlen)
- l = (int)fromlen;
}
+ l = utfc_ptr2len_len((const char_u *)from, (int)fromlen);
from += l;
fromlen -= l;
} else if (ICONV_ERRNO != ICONV_E2BIG) {
@@ -2701,7 +2125,7 @@ static HINSTANCE hMsvcrtDLL = 0;
# ifndef DYNAMIC_ICONV_DLL
# define DYNAMIC_ICONV_DLL "iconv.dll"
-# define DYNAMIC_ICONV_DLL_ALT "libiconv.dll"
+# define DYNAMIC_ICONV_DLL_ALT "libiconv-2.dll"
# endif
# ifndef DYNAMIC_MSVCRT_DLL
# define DYNAMIC_MSVCRT_DLL "msvcrt.dll"
@@ -2747,6 +2171,35 @@ static void * get_iconv_import_func(HINSTANCE hInst,
return NULL;
}
+// Load library "name".
+HINSTANCE vimLoadLib(char *name)
+{
+ HINSTANCE dll = NULL;
+
+ // NOTE: Do not use mch_dirname() and mch_chdir() here, they may call
+ // vimLoadLib() recursively, which causes a stack overflow.
+ wchar_t old_dirw[MAXPATHL];
+
+ // Path to exe dir.
+ char *buf = xstrdup((char *)get_vim_var_str(VV_PROGPATH));
+ // ptrdiff_t len = ;
+ // assert(len > 0);
+ buf[path_tail_with_sep(buf) - buf] = '\0';
+
+ if (GetCurrentDirectoryW(MAXPATHL, old_dirw) != 0) {
+ // Change directory to where the executable is, both to make
+ // sure we find a .dll there and to avoid looking for a .dll
+ // in the current directory.
+ SetCurrentDirectory((LPCSTR)buf);
+ // TODO(justinmk): use uv_dlopen instead. see os_libcall
+ dll = LoadLibrary(name);
+ SetCurrentDirectoryW(old_dirw);
+ }
+
+ return dll;
+}
+
+
/*
* Try opening the iconv.dll and return TRUE if iconv() can be used.
*/
@@ -2794,10 +2247,13 @@ bool iconv_enabled(bool verbose)
void iconv_end(void)
{
- if (hIconvDLL != 0)
+ if (hIconvDLL != 0) {
+ // TODO(justinmk): use uv_dlclose instead.
FreeLibrary(hIconvDLL);
- if (hMsvcrtDLL != 0)
+ }
+ if (hMsvcrtDLL != 0) {
FreeLibrary(hMsvcrtDLL);
+ }
hIconvDLL = 0;
hMsvcrtDLL = 0;
}
@@ -2839,9 +2295,7 @@ int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8,
if (vcp->vc_type == CONV_ICONV && vcp->vc_fd != (iconv_t)-1)
iconv_close(vcp->vc_fd);
# endif
- vcp->vc_type = CONV_NONE;
- vcp->vc_factor = 1;
- vcp->vc_fail = false;
+ *vcp = (vimconv_T)MBYTE_NONE_CONV;
/* No conversion when one of the names is empty or they are equal. */
if (from == NULL || *from == NUL || to == NULL || *to == NUL
@@ -2899,7 +2353,7 @@ int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8,
* Illegal chars are often changed to "?", unless vcp->vc_fail is set.
* When something goes wrong, NULL is returned and "*lenp" is unchanged.
*/
-char_u * string_convert(vimconv_T *vcp, char_u *ptr, size_t *lenp)
+char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp)
{
return string_convert_ext(vcp, ptr, lenp, NULL);
}
@@ -2909,7 +2363,7 @@ char_u * string_convert(vimconv_T *vcp, char_u *ptr, size_t *lenp)
* an incomplete sequence at the end it is not converted and "*unconvlenp" is
* set to the number of remaining bytes.
*/
-char_u * string_convert_ext(vimconv_T *vcp, char_u *ptr,
+char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr,
size_t *lenp, size_t *unconvlenp)
{
char_u *retval = NULL;
@@ -3037,23 +2491,3 @@ char_u * string_convert_ext(vimconv_T *vcp, char_u *ptr,
return retval;
}
-
-// Check bounds for column number
-static int check_col(int col)
-{
- if (col < 0)
- return 0;
- if (col >= screen_Columns)
- return screen_Columns - 1;
- return col;
-}
-
-// Check bounds for row number
-static int check_row(int row)
-{
- if (row < 0)
- return 0;
- if (row >= screen_Rows)
- return screen_Rows - 1;
- return row;
-}