aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Björn Linse <bjorn.linse@gmail.com> 2016-11-05 18:12:14 +0100
committer GitHub <noreply@github.com> 2016-11-05 18:12:14 +0100
commit 9147331e212e8d0fff0f30344007faffd42609ee (patch)
tree 6d1853a72b94ce19ba0cc163c664a9a206da0091 /src
parent 32d9c19e294f38a6adae6e055fc606fc3fd33f2f (diff)
parent 4ab3fe8eaadb5456eeafc49df2fb0ecf71d836cc (diff)
download rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.gz
rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.bz2
rneovim-9147331e212e8d0fff0f30344007faffd42609ee.zip
Merge pull request #2905 from bfredl/utf8
Only allow encoding=utf-8 and simplify multibyte code
Diffstat (limited to 'src')
-rw-r--r-- src/nvim/charset.c 16
-rw-r--r-- src/nvim/fileio.c 5
-rw-r--r-- src/nvim/globals.h 42
-rw-r--r-- src/nvim/macros.h 49
-rw-r--r-- src/nvim/main.c 1
-rw-r--r-- src/nvim/mbyte.c 609
-rw-r--r-- src/nvim/mbyte.h 16
-rw-r--r-- src/nvim/ops.c 3
-rw-r--r-- src/nvim/option.c 44
-rw-r--r-- src/nvim/screen.c 5
-rw-r--r-- src/nvim/spell.c 6
-rw-r--r-- src/nvim/tui/input.c 4
12 files changed, 129 insertions, 671 deletions
diff --git a/src/nvim/charset.c b/src/nvim/charset.c
index 61c5b10808..c501b7e83f 100644
--- a/src/nvim/charset.c
+++ b/src/nvim/charset.c
@@ -1612,9 +1612,7 @@ bool vim_islower(int c)
return false;
}
- if (enc_latin1like) {
- return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
- }
+ return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;
}
return islower(c);
}
@@ -1643,9 +1641,7 @@ bool vim_isupper(int c)
return false;
}
- if (enc_latin1like) {
- return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
- }
+ return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;
}
return isupper(c);
}
@@ -1670,9 +1666,7 @@ int vim_toupper(int c)
return c;
}
- if (enc_latin1like) {
- return latin1upper[c];
- }
+ return latin1upper[c];
}
return TOUPPER_LOC(c);
}
@@ -1697,9 +1691,7 @@ int vim_tolower(int c)
return c;
}
- if (enc_latin1like) {
- return latin1lower[c];
- }
+ return latin1lower[c];
}
return TOLOWER_LOC(c);
}
diff --git a/src/nvim/fileio.c b/src/nvim/fileio.c
index c0d4a71b35..0eb475d425 100644
--- a/src/nvim/fileio.c
+++ b/src/nvim/fileio.c
@@ -4165,9 +4165,8 @@ static bool need_conversion(const char_u *fenc)
same_encoding = (enc_flags != 0 && fenc_flags == enc_flags);
}
if (same_encoding) {
- /* Specified encoding matches with 'encoding'. This requires
- * conversion when 'encoding' is Unicode but not UTF-8. */
- return enc_unicode != 0;
+ // Specified file encoding matches UTF-8.
+ return false;
}
/* Encodings differ. However, conversion is not needed when 'enc' is any
diff --git a/src/nvim/globals.h b/src/nvim/globals.h
index 87fb928b30..e42382ad00 100644
--- a/src/nvim/globals.h
+++ b/src/nvim/globals.h
@@ -778,44 +778,18 @@ EXTERN int vr_lines_changed INIT(= 0); /* #Lines changed by "gR" so far */
# define DBCS_2BYTE 1 /* 2byte- */
# define DBCS_DEBUG -1
-EXTERN int enc_dbcs INIT(= 0); /* One of DBCS_xxx values if
- DBCS encoding */
-EXTERN int enc_unicode INIT(= 0); /* 2: UCS-2 or UTF-16, 4: UCS-4 */
-EXTERN bool enc_utf8 INIT(= false); /* UTF-8 encoded Unicode */
-EXTERN int enc_latin1like INIT(= TRUE); /* 'encoding' is latin1 comp. */
-EXTERN int has_mbyte INIT(= 0); /* any multi-byte encoding */
+// mbyte flags that used to depend on 'encoding'. These are now deprecated, as
+// 'encoding' is always "utf-8". Code that uses them can be refactored to
+// remove dead code.
+#define enc_dbcs false
+#define enc_utf8 true
+#define has_mbyte true
/// Encoding used when 'fencs' is set to "default"
EXTERN char_u *fenc_default INIT(= NULL);
-/*
- * To speed up BYTELEN() we fill a table with the byte lengths whenever
- * enc_utf8 or enc_dbcs changes.
- */
-EXTERN char mb_bytelen_tab[256];
-
-/*
- * Function pointers, used to quickly get to the right function. Each has
- * three possible values: latin_ (8-bit), utfc_ or utf_ (utf-8) and dbcs_
- * (DBCS).
- * The value is set in mb_init();
- */
-/* length of char in bytes, including following composing chars */
-EXTERN int (*mb_ptr2len)(const char_u *p) INIT(= latin_ptr2len);
-/* idem, with limit on string length */
-EXTERN int (*mb_ptr2len_len)(const char_u *p, int size) INIT(= latin_ptr2len_len);
-/* byte length of char */
-EXTERN int (*mb_char2len)(int c) INIT(= latin_char2len);
-/* convert char to bytes, return the length */
-EXTERN int (*mb_char2bytes)(int c, char_u *buf) INIT(= latin_char2bytes);
-EXTERN int (*mb_ptr2cells)(const char_u *p) INIT(= latin_ptr2cells);
-EXTERN int (*mb_ptr2cells_len)(const char_u *p, int size) INIT(
- = latin_ptr2cells_len);
-EXTERN int (*mb_char2cells)(int c) INIT(= latin_char2cells);
-EXTERN int (*mb_off2cells)(unsigned off, unsigned max_off) INIT(
- = latin_off2cells);
-EXTERN int (*mb_ptr2char)(const char_u *p) INIT(= latin_ptr2char);
-EXTERN int (*mb_head_off)(const char_u *base, const char_u *p) INIT(= latin_head_off);
+// To speed up MB_BYTE2LEN() we keep a table with the byte lengths for utf-8
+EXTERN char utf8len_tab[256];
# if defined(USE_ICONV) && defined(DYNAMIC_ICONV)
/* Pointers to functions and variables to be loaded at runtime */
diff --git a/src/nvim/macros.h b/src/nvim/macros.h
index 503daa9648..79e545771e 100644
--- a/src/nvim/macros.h
+++ b/src/nvim/macros.h
@@ -122,32 +122,29 @@
/* Whether to draw the vertical bar on the right side of the cell. */
# define CURSOR_BAR_RIGHT (curwin->w_p_rl && (!(State & CMDLINE) || cmdmsg_rl))
-/*
- * mb_ptr_adv(): advance a pointer to the next character, taking care of
- * multi-byte characters if needed.
- * mb_ptr_back(): backup a pointer to the previous character, taking care of
- * multi-byte characters if needed.
- * MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
- * PTR2CHAR(): get character from pointer.
- */
-/* Get the length of the character p points to */
-# define MB_PTR2LEN(p) (has_mbyte ? (*mb_ptr2len)(p) : 1)
-/* Advance multi-byte pointer, skip over composing chars. */
-# define mb_ptr_adv(p) (p += has_mbyte ? (*mb_ptr2len)((char_u *)p) : 1)
-/* Advance multi-byte pointer, do not skip over composing chars. */
-# define mb_cptr_adv(p) (p += \
- enc_utf8 ? utf_ptr2len(p) : has_mbyte ? (*mb_ptr2len)(p) : 1)
-/* Backup multi-byte pointer. Only use with "p" > "s" ! */
-# define mb_ptr_back(s, p) (p -= has_mbyte ? ((*mb_head_off)((char_u *)s, (char_u *)p - 1) + 1) : 1)
-/* get length of multi-byte char, not including composing chars */
-# define mb_cptr2len(p) (enc_utf8 ? utf_ptr2len(p) : (*mb_ptr2len)(p))
-
-# define MB_COPY_CHAR(f, t) \
- if (has_mbyte) mb_copy_char((const char_u **)(&f), &t); \
- else *t++ = *f++
-# define MB_CHARLEN(p) (has_mbyte ? mb_charlen(p) : (int)STRLEN(p))
-# define MB_CHAR2LEN(c) (has_mbyte ? mb_char2len(c) : 1)
-# define PTR2CHAR(p) (has_mbyte ? mb_ptr2char(p) : (int)*(p))
+// mb_ptr_adv(): advance a pointer to the next character, taking care of
+// multi-byte characters if needed.
+// mb_ptr_back(): backup a pointer to the previous character, taking care of
+// multi-byte characters if needed.
+// MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers.
+// PTR2CHAR(): get character from pointer.
+
+// Get the length of the character p points to
+# define MB_PTR2LEN(p) mb_ptr2len(p)
+// Advance multi-byte pointer, skip over composing chars.
+# define mb_ptr_adv(p) (p += mb_ptr2len((char_u *)p))
+// Advance multi-byte pointer, do not skip over composing chars.
+# define mb_cptr_adv(p) (p += utf_ptr2len(p))
+// Backup multi-byte pointer. Only use with "p" > "s" !
+# define mb_ptr_back(s, p) (p -= mb_head_off((char_u *)s, (char_u *)p - 1) + 1)
+// get length of multi-byte char, not including composing chars
+# define mb_cptr2len(p) utf_ptr2len(p)
+
+# define MB_COPY_CHAR(f, t) mb_copy_char((const char_u **)(&f), &t);
+
+# define MB_CHARLEN(p) mb_charlen(p)
+# define MB_CHAR2LEN(c) mb_char2len(c)
+# define PTR2CHAR(p) mb_ptr2char(p)
# define RESET_BINDING(wp) (wp)->w_p_scb = FALSE; (wp)->w_p_crb = FALSE
diff --git a/src/nvim/main.c b/src/nvim/main.c
index eb67483d08..ffd9353252 100644
--- a/src/nvim/main.c
+++ b/src/nvim/main.c
@@ -177,7 +177,6 @@ void early_init(void)
fs_init();
handle_init();
- (void)mb_init(); // init mb_bytelen_tab[] to ones
eval_init(); // init global variables
// Init the table of Normal mode commands.
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index e6312f9c00..7be0be7106 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1,68 +1,27 @@
-/*
- * mbyte.c: Code specifically for handling multi-byte characters.
- * Multibyte extensions partly by Sung-Hoon Baek
- *
- * The encoding used in the core is set with 'encoding'. When 'encoding' is
- * changed, the following four variables are set (for speed).
- * Currently these types of character encodings are supported:
- *
- * "enc_dbcs" When non-zero it tells the type of double byte character
- * encoding (Chinese, Korean, Japanese, etc.).
- * The cell width on the display is equal to the number of
- * bytes. (exception: DBCS_JPNU with first byte 0x8e)
- * Recognizing the first or second byte is difficult, it
- * requires checking a byte sequence from the start.
- * "enc_utf8" When TRUE use Unicode characters in UTF-8 encoding.
- * The cell width on the display needs to be determined from
- * the character value.
- * Recognizing bytes is easy: 0xxx.xxxx is a single-byte
- * char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
- * byte of a multi-byte character.
- * To make things complicated, up to six composing characters
- * are allowed. These are drawn on top of the first char.
- * For most editing the sequence of bytes with composing
- * characters included is considered to be one character.
- * "enc_unicode" When 2 use 16-bit Unicode characters (or UTF-16).
- * When 4 use 32-but Unicode characters.
- * Internally characters are stored in UTF-8 encoding to
- * avoid NUL bytes. Conversion happens when doing I/O.
- * "enc_utf8" will also be TRUE.
- *
- * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
- *
- * If none of these is TRUE, 8-bit bytes are used for a character. The
- * encoding isn't currently specified (TODO).
- *
- * 'encoding' specifies the encoding used in the core. This is in registers,
- * text manipulation, buffers, etc. Conversion has to be done when characters
- * in another encoding are received or send:
- *
- * clipboard
- * ^
- * | (2)
- * V
- * +---------------+
- * (1) | | (3)
- * keyboard ----->| core |-----> display
- * | |
- * +---------------+
- * ^
- * | (4)
- * V
- * file
- *
- * (1) Typed characters arrive in the current locale.
- * (2) Text will be made available with the encoding specified with
- * 'encoding'. If this is not sufficient, system-specific conversion
- * might be required.
- * (3) For the GUI the correct font must be selected, no conversion done.
- * (4) The encoding of the file is specified with 'fileencoding'. Conversion
- * is to be done when it's different from 'encoding'.
- *
- * The ShaDa file is a special case: Only text is converted, not file names.
- * Vim scripts may contain an ":encoding" command. This has an effect for
- * some commands, like ":menutrans"
- */
+/// mbyte.c: Code specifically for handling multi-byte characters.
+/// Multibyte extensions partly by Sung-Hoon Baek
+///
+/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" are
+/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is
+/// read-only and always reads "utf-8".
+///
+/// The cell width on the display needs to be determined from the character
+/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
+/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
+/// character. To make things complicated, up to six composing characters
+/// are allowed. These are drawn on top of the first char. For most editing
+/// the sequence of bytes with composing characters included is considered to
+/// be one character.
+///
+/// UTF-8 is used everywhere in the core. This is in registers, text
+/// manipulation, buffers, etc. Nvim core communicates with external plugins
+/// and GUIs in this encoding.
+///
+/// The encoding of a file is specified with 'fileencoding'. Conversion
+/// is to be done when it's different from "utf-8".
+///
+/// Vim scripts may contain a ":scriptencoding" command. This has an effect
+/// for some commands, like ":menutrans".
#include <inttypes.h>
#include <stdbool.h>
@@ -115,7 +74,7 @@ struct interval {
* Bytes which are illegal when used as the first byte have a 1.
* The NUL byte has length 1.
*/
-static char utf8len_tab[256] =
+char utf8len_tab[256] =
{
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -385,207 +344,6 @@ int enc_canon_props(const char_u *name)
}
/*
- * Set up for using multi-byte characters.
- * Called in three cases:
- * - by main() to initialize (p_enc == NULL)
- * - by set_init_1() after 'encoding' was set to its default.
- * - by do_set() when 'encoding' has been set.
- * p_enc must have been passed through enc_canonize() already.
- * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
- * Fills mb_bytelen_tab[] and returns NULL when there are no problems.
- * When there is something wrong: Returns an error message and doesn't change
- * anything.
- */
-char_u * mb_init(void)
-{
- int i;
- int idx;
- int n;
- int enc_dbcs_new = 0;
-#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
- && !defined(MACOS)
-# define LEN_FROM_CONV
- vimconv_T vimconv;
- char_u *p;
-#endif
-
- if (p_enc == NULL) {
- /* Just starting up: set the whole table to one's. */
- for (i = 0; i < 256; ++i)
- mb_bytelen_tab[i] = 1;
- return NULL;
- } else if (STRNCMP(p_enc, "8bit-", 5) == 0
- || STRNCMP(p_enc, "iso-8859-", 9) == 0) {
- /* Accept any "8bit-" or "iso-8859-" name. */
- enc_unicode = 0;
- enc_utf8 = false;
- } else if (STRNCMP(p_enc, "2byte-", 6) == 0) {
- /* Unix: accept any "2byte-" name, assume current locale. */
- enc_dbcs_new = DBCS_2BYTE;
- } else if ((idx = enc_canon_search(p_enc)) >= 0) {
- i = enc_canon_table[idx].prop;
- if (i & ENC_UNICODE) {
- /* Unicode */
- enc_utf8 = true;
- if (i & (ENC_2BYTE | ENC_2WORD))
- enc_unicode = 2;
- else if (i & ENC_4BYTE)
- enc_unicode = 4;
- else
- enc_unicode = 0;
- } else if (i & ENC_DBCS) {
- /* 2byte, handle below */
- enc_dbcs_new = enc_canon_table[idx].codepage;
- } else {
- /* Must be 8-bit. */
- enc_unicode = 0;
- enc_utf8 = false;
- }
- } else /* Don't know what encoding this is, reject it. */
- return e_invarg;
-
- if (enc_dbcs_new != 0) {
- enc_unicode = 0;
- enc_utf8 = false;
- }
- enc_dbcs = enc_dbcs_new;
- has_mbyte = (enc_dbcs != 0 || enc_utf8);
-
-
- /* Detect an encoding that uses latin1 characters. */
- enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0
- || STRCMP(p_enc, "iso-8859-15") == 0);
-
- /*
- * Set the function pointers.
- */
- if (enc_utf8) {
- mb_ptr2len = utfc_ptr2len;
- mb_ptr2len_len = utfc_ptr2len_len;
- mb_char2len = utf_char2len;
- mb_char2bytes = utf_char2bytes;
- mb_ptr2cells = utf_ptr2cells;
- mb_ptr2cells_len = utf_ptr2cells_len;
- mb_char2cells = utf_char2cells;
- mb_off2cells = utf_off2cells;
- mb_ptr2char = utf_ptr2char;
- mb_head_off = utf_head_off;
- } else if (enc_dbcs != 0) {
- mb_ptr2len = dbcs_ptr2len;
- mb_ptr2len_len = dbcs_ptr2len_len;
- mb_char2len = dbcs_char2len;
- mb_char2bytes = dbcs_char2bytes;
- mb_ptr2cells = dbcs_ptr2cells;
- mb_ptr2cells_len = dbcs_ptr2cells_len;
- mb_char2cells = dbcs_char2cells;
- mb_off2cells = dbcs_off2cells;
- mb_ptr2char = dbcs_ptr2char;
- mb_head_off = dbcs_head_off;
- } else {
- mb_ptr2len = latin_ptr2len;
- mb_ptr2len_len = latin_ptr2len_len;
- mb_char2len = latin_char2len;
- mb_char2bytes = latin_char2bytes;
- mb_ptr2cells = latin_ptr2cells;
- mb_ptr2cells_len = latin_ptr2cells_len;
- mb_char2cells = latin_char2cells;
- mb_off2cells = latin_off2cells;
- mb_ptr2char = latin_ptr2char;
- mb_head_off = latin_head_off;
- }
-
- /*
- * Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
- */
-#ifdef LEN_FROM_CONV
- /* When 'encoding' is different from the current locale mblen() won't
- * work. Use conversion to "utf-8" instead. */
- vimconv.vc_type = CONV_NONE;
- if (enc_dbcs) {
- p = enc_locale();
- if (p == NULL || STRCMP(p, p_enc) != 0) {
- convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
- vimconv.vc_fail = true;
- }
- xfree(p);
- }
-#endif
-
- for (i = 0; i < 256; ++i) {
- /* Our own function to reliably check the length of UTF-8 characters,
- * independent of mblen(). */
- if (enc_utf8)
- n = utf8len_tab[i];
- else if (enc_dbcs == 0)
- n = 1;
- else {
- char buf[MB_MAXBYTES + 1];
- if (i == NUL) /* just in case mblen() can't handle "" */
- n = 1;
- else {
- buf[0] = i;
- buf[1] = 0;
-#ifdef LEN_FROM_CONV
- if (vimconv.vc_type != CONV_NONE) {
- /*
- * string_convert() should fail when converting the first
- * byte of a double-byte character.
- */
- p = string_convert(&vimconv, (char_u *)buf, NULL);
- if (p != NULL) {
- xfree(p);
- n = 1;
- } else
- n = 2;
- } else
-#endif
- {
- /*
- * mblen() should return -1 for invalid (means the leading
- * multibyte) character. However there are some platforms
- * where mblen() returns 0 for invalid character.
- * Therefore, following condition includes 0.
- */
- ignored = mblen(NULL, 0); /* First reset the state. */
- if (mblen(buf, (size_t)1) <= 0)
- n = 2;
- else
- n = 1;
- }
- }
- }
- mb_bytelen_tab[i] = n;
- }
-
-#ifdef LEN_FROM_CONV
- convert_setup(&vimconv, NULL, NULL);
-#endif
-
- /* The cell width depends on the type of multi-byte characters. */
- (void)init_chartab();
-
- /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
- screenalloc(false);
-
-#ifdef HAVE_WORKING_LIBINTL
- /* GNU gettext 0.10.37 supports this feature: set the codeset used for
- * translated messages independently from the current locale. */
- (void)bind_textdomain_codeset(PROJECT_NAME,
- enc_utf8 ? "utf-8" : (char *)p_enc);
-#endif
-
-
- /* Fire an autocommand to let people do custom font setup. This must be
- * after Vim has been setup for the new encoding. */
- apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
-
- /* Need to reload spell dictionaries */
- spell_reload();
-
- return NULL;
-}
-
-/*
* Return the size of the BOM for the current buffer:
* 0 - no BOM
* 2 - UCS-2 or UTF-16 BOM
@@ -597,20 +355,15 @@ int bomb_size(void)
int n = 0;
if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
- if (*curbuf->b_p_fenc == NUL) {
- if (enc_utf8) {
- if (enc_unicode != 0)
- n = enc_unicode;
- else
- n = 3;
- }
- } else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
+ if (*curbuf->b_p_fenc == NUL
+ || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
n = 3;
- else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
- || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
+ } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
+ || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
n = 2;
- else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
+ } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
n = 4;
+ }
}
return n;
}
@@ -804,99 +557,6 @@ int dbcs_class(unsigned lead, unsigned trail)
}
/*
- * mb_char2len() function pointer.
- * Return length in bytes of character "c".
- * Returns 1 for a single-byte character.
- */
-int latin_char2len(int c)
-{
- return 1;
-}
-
-static int dbcs_char2len(int c)
-{
- if (c >= 0x100)
- return 2;
- return 1;
-}
-
-/*
- * mb_char2bytes() function pointer.
- * Convert a character to its bytes.
- * Returns the length in bytes.
- */
-int latin_char2bytes(int c, char_u *buf)
-{
- buf[0] = c;
- return 1;
-}
-
-static int dbcs_char2bytes(int c, char_u *buf)
-{
- if (c >= 0x100) {
- buf[0] = (unsigned)c >> 8;
- buf[1] = c;
- /* Never use a NUL byte, it causes lots of trouble. It's an invalid
- * character anyway. */
- if (buf[1] == NUL)
- buf[1] = '\n';
- return 2;
- }
- buf[0] = c;
- return 1;
-}
-
-/*
- * mb_ptr2len() function pointer.
- * Get byte length of character at "*p" but stop at a NUL.
- * For UTF-8 this includes following composing characters.
- * Returns 0 when *p is NUL.
- */
-int latin_ptr2len(const char_u *p)
-{
- return MB_BYTE2LEN(*p);
-}
-
-static int dbcs_ptr2len(const char_u *p)
-{
- int len;
-
- /* Check if second byte is not missing. */
- len = MB_BYTE2LEN(*p);
- if (len == 2 && p[1] == NUL)
- len = 1;
- return len;
-}
-
-/*
- * mb_ptr2len_len() function pointer.
- * Like mb_ptr2len(), but limit to read "size" bytes.
- * Returns 0 for an empty string.
- * Returns 1 for an illegal char or an incomplete byte sequence.
- */
-int latin_ptr2len_len(const char_u *p, int size)
-{
- if (size < 1 || *p == NUL)
- return 0;
- return 1;
-}
-
-static int dbcs_ptr2len_len(const char_u *p, int size)
-{
- int len;
-
- if (size < 1 || *p == NUL)
- return 0;
- if (size == 1)
- return 1;
- /* Check that second byte is not missing. */
- len = MB_BYTE2LEN(*p);
- if (len == 2 && p[1] == NUL)
- len = 1;
- return len;
-}
-
-/*
* Return true if "c" is in "table".
*/
static bool intable(const struct interval *table, size_t n_items, int c)
@@ -963,16 +623,8 @@ int utf_char2cells(int c)
return 1;
}
-/*
- * mb_ptr2cells() function pointer.
- * Return the number of display cells character at "*p" occupies.
- * This doesn't take care of unprintable characters, use ptr2cells() for that.
- */
-int latin_ptr2cells(const char_u *p)
-{
- return 1;
-}
-
+/// Return the number of display cells character at "*p" occupies.
+/// This doesn't take care of unprintable characters, use ptr2cells() for that.
int utf_ptr2cells(const char_u *p)
{
int c;
@@ -991,26 +643,9 @@ int utf_ptr2cells(const char_u *p)
return 1;
}
-int dbcs_ptr2cells(const char_u *p)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && *p == 0x8e)
- return 1;
- return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_ptr2cells_len() function pointer.
- * Like mb_ptr2cells(), but limit string length to "size".
- * For an empty string or truncated character returns 1.
- */
-int latin_ptr2cells_len(const char_u *p, int size)
-{
- return 1;
-}
-
-static int utf_ptr2cells_len(const char_u *p, int size)
+/// Like utf_ptr2cells(), but limit string length to "size".
+/// For an empty string or truncated character returns 1.
+int utf_ptr2cells_len(const char_u *p, int size)
{
int c;
@@ -1030,35 +665,6 @@ static int utf_ptr2cells_len(const char_u *p, int size)
return 1;
}
-static int dbcs_ptr2cells_len(const char_u *p, int size)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (size <= 1 || (enc_dbcs == DBCS_JPNU && *p == 0x8e))
- return 1;
- return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_char2cells() function pointer.
- * Return the number of display cells character "c" occupies.
- * Only takes care of multi-byte chars, not "^C" and such.
- */
-int latin_char2cells(int c)
-{
- return 1;
-}
-
-static int dbcs_char2cells(int c)
-{
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && ((unsigned)c >> 8) == 0x8e)
- return 1;
- /* use the first byte */
- return MB_BYTE2LEN((unsigned)c >> 8);
-}
-
/// Calculate the number of cells occupied by string `str`.
///
/// @param str The source string, may not be NULL, must be a NUL-terminated
@@ -1075,51 +681,14 @@ size_t mb_string2cells(const char_u *str)
return clen;
}
-/*
- * mb_off2cells() function pointer.
- * Return number of display cells for char at ScreenLines[off].
- * We make sure that the offset used is less than "max_off".
- */
-int latin_off2cells(unsigned off, unsigned max_off)
-{
- return 1;
-}
-
-int dbcs_off2cells(unsigned off, unsigned max_off)
-{
- /* never check beyond end of the line */
- if (off >= max_off)
- return 1;
-
- /* Number of cells is equal to number of bytes, except for euc-jp when
- * the first byte is 0x8e. */
- if (enc_dbcs == DBCS_JPNU && ScreenLines[off] == 0x8e)
- return 1;
- return MB_BYTE2LEN(ScreenLines[off]);
-}
-
+/// Return number of display cells for char at ScreenLines[off].
+/// We make sure that the offset used is less than "max_off".
int utf_off2cells(unsigned off, unsigned max_off)
{
return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1;
}
/*
- * mb_ptr2char() function pointer.
- * Convert a byte sequence into a character.
- */
-int latin_ptr2char(const char_u *p)
-{
- return *p;
-}
-
-static int dbcs_ptr2char(const char_u *p)
-{
- if (MB_BYTE2LEN(*p) > 1 && p[1] != NUL)
- return (p[0] << 8) + p[1];
- return *p;
-}
-
-/*
* Convert a UTF-8 byte sequence to a wide character.
* If the sequence is illegal or truncated by a NUL the first byte is
* returned.
@@ -2065,68 +1634,9 @@ void show_utf8(void)
msg(IObuff);
}
-/*
- * mb_head_off() function pointer.
- * Return offset from "p" to the first byte of the character it points into.
- * If "p" points to the NUL at the end of the string return 0.
- * Returns 0 when already at the first byte of a character.
- */
-int latin_head_off(const char_u *base, const char_u *p)
-{
- return 0;
-}
-
-int dbcs_head_off(const char_u *base, const char_u *p)
-{
- /* It can't be a trailing byte when not using DBCS, at the start of the
- * string or the previous byte can't start a double-byte. */
- if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) {
- return 0;
- }
-
- /* This is slow: need to start at the base and go forward until the
- * byte we are looking for. Return 1 when we went past it, 0 otherwise. */
- const char_u *q = base;
- while (q < p) {
- q += dbcs_ptr2len(q);
- }
-
- return (q == p) ? 0 : 1;
-}
-
-/*
- * Special version of dbcs_head_off() that works for ScreenLines[], where
- * single-width DBCS_JPNU characters are stored separately.
- */
-int dbcs_screen_head_off(const char_u *base, const char_u *p)
-{
- /* It can't be a trailing byte when not using DBCS, at the start of the
- * string or the previous byte can't start a double-byte.
- * For euc-jp an 0x8e byte in the previous cell always means we have a
- * lead byte in the current cell. */
- if (p <= base
- || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e)
- || MB_BYTE2LEN(p[-1]) == 1
- || *p == NUL)
- return 0;
-
- /* This is slow: need to start at the base and go forward until the
- * byte we are looking for. Return 1 when we went past it, 0 otherwise.
- * For DBCS_JPNU look out for 0x8e, which means the second byte is not
- * stored as the next byte. */
- const char_u *q = base;
- while (q < p) {
- if (enc_dbcs == DBCS_JPNU && *q == 0x8e) {
- ++q;
- }
- else {
- q += dbcs_ptr2len(q);
- }
- }
-
- return (q == p) ? 0 : 1;
-}
-
+/// Return offset from "p" to the first byte of the character it points into.
+/// If "p" points to the NUL at the end of the string return 0.
+/// Returns 0 when already at the first byte of a character.
int utf_head_off(const char_u *base, const char_u *p)
{
int c;
@@ -2232,26 +1742,20 @@ int mb_tail_off(char_u *base, char_u *p)
if (*p == NUL)
return 0;
- if (enc_utf8) {
- /* Find the last character that is 10xx.xxxx */
- for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i)
- ;
- /* Check for illegal sequence. */
- for (j = 0; p - j > base; ++j)
- if ((p[-j] & 0xc0) != 0x80)
- break;
- if (utf8len_tab[p[-j]] != i + j + 1)
- return 0;
- return i;
+ // Find the last character that is 10xx.xxxx
+ for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
+
+ // Check for illegal sequence.
+ for (j = 0; p - j > base; j++) {
+ if ((p[-j] & 0xc0) != 0x80) {
+ break;
+ }
}
- /* It can't be the first byte if a double-byte when not using DBCS, at the
- * end of the string or the byte can't start a double-byte. */
- if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1)
+ if (utf8len_tab[p[-j]] != i + j + 1) {
return 0;
-
- /* Return 1 when on the lead byte, 0 when on the tail byte. */
- return 1 - dbcs_head_off(base, p);
+ }
+ return i;
}
/*
@@ -2466,13 +1970,10 @@ int mb_fix_col(int col, int row)
{
col = check_col(col);
row = check_row(row);
- if (has_mbyte && ScreenLines != NULL && col > 0
- && ((enc_dbcs
- && ScreenLines[LineOffset[row] + col] != NUL
- && dbcs_screen_head_off(ScreenLines + LineOffset[row],
- ScreenLines + LineOffset[row] + col))
- || (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0)))
+ if (ScreenLines != NULL && col > 0
+ && ScreenLines[LineOffset[row] + col] == 0) {
return col - 1;
+ }
return col;
}
diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h
index 0cfe2c4bab..2c92a0fbb2 100644
--- a/src/nvim/mbyte.h
+++ b/src/nvim/mbyte.h
@@ -9,8 +9,8 @@
* MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.
* Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!
*/
-#define MB_BYTE2LEN(b) mb_bytelen_tab[b]
-#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : mb_bytelen_tab[b])
+#define MB_BYTE2LEN(b) utf8len_tab[b]
+#define MB_BYTE2LEN_CHECK(b) (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b])
/* properties used in enc_canon_table[] (first three mutually exclusive) */
#define ENC_8BIT 0x01
@@ -28,6 +28,18 @@
#define ENC_LATIN9 0x400 /* Latin9 */
#define ENC_MACROMAN 0x800 /* Mac Roman (not Macro Man! :-) */
+// TODO(bfredl): eventually we should keep only one of the namings
+#define mb_ptr2len utfc_ptr2len
+#define mb_ptr2len_len utfc_ptr2len_len
+#define mb_char2len utf_char2len
+#define mb_char2bytes utf_char2bytes
+#define mb_ptr2cells utf_ptr2cells
+#define mb_ptr2cells_len utf_ptr2cells_len
+#define mb_char2cells utf_char2cells
+#define mb_off2cells utf_off2cells
+#define mb_ptr2char utf_ptr2char
+#define mb_head_off utf_head_off
+
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.h.generated.h"
#endif
diff --git a/src/nvim/ops.c b/src/nvim/ops.c
index 388a72adce..0263bd15da 100644
--- a/src/nvim/ops.c
+++ b/src/nvim/ops.c
@@ -1936,8 +1936,7 @@ int swapchar(int op_type, pos_T *pos)
if (c >= 0x80 && op_type == OP_ROT13)
return FALSE;
- if (op_type == OP_UPPER && c == 0xdf
- && (enc_latin1like || STRCMP(p_enc, "iso-8859-2") == 0)) {
+ if (op_type == OP_UPPER && c == 0xdf) {
pos_T sp = curwin->w_cursor;
/* Special handling of German sharp s: change to "SS". */
diff --git a/src/nvim/option.c b/src/nvim/option.c
index a255165e32..311982982b 100644
--- a/src/nvim/option.c
+++ b/src/nvim/option.c
@@ -780,14 +780,11 @@ void set_init_1(void)
}
fenc_default = p;
- // Initialize multibyte (utf-8) handling
- mb_init();
-
- // Don't change &encoding when resetting to defaults with ":set all&".
- opt_idx = findoption((char_u *)"encoding");
- if (opt_idx >= 0) {
- options[opt_idx].flags |= P_NODEFAULT;
- }
+#ifdef HAVE_WORKING_LIBINTL
+ // GNU gettext 0.10.37 supports this feature: set the codeset used for
+ // translated messages independently from the current locale.
+ (void)bind_textdomain_codeset(PROJECT_NAME, (char *)p_enc);
+#endif
/* Set the default for 'helplang'. */
set_helplang_default(get_mess_lang());
@@ -2580,19 +2577,17 @@ did_set_string_option (
errmsg = e_invarg;
/* 'encoding' and 'fileencoding' */
} else if (varp == &p_enc || gvarp == &p_fenc) {
- if (varp == &p_enc && did_source_startup_scripts) {
- errmsg = e_afterinit;
- } else if (gvarp == &p_fenc) {
- if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL)
+ if (gvarp == &p_fenc) {
+ if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL) {
errmsg = e_modifiable;
- else if (vim_strchr(*varp, ',') != NULL)
- /* No comma allowed in 'fileencoding'; catches confusing it
- * with 'fileencodings'. */
+ } else if (vim_strchr(*varp, ',') != NULL) {
+ // No comma allowed in 'fileencoding'; catches confusing it
+ // with 'fileencodings'.
errmsg = e_invarg;
- else {
- /* May show a "+" in the title now. */
+ } else {
+ // May show a "+" in the title now.
redraw_titles();
- /* Add 'fileencoding' to the swap file. */
+ // Add 'fileencoding' to the swap file.
ml_setflags(curbuf);
}
}
@@ -2603,17 +2598,12 @@ did_set_string_option (
xfree(*varp);
*varp = p;
if (varp == &p_enc) {
- errmsg = mb_init();
- redraw_titles();
+ // only encoding=utf-8 allowed
+ if (STRCMP(p_enc, "utf-8") != 0) {
+ errmsg = e_invarg;
+ }
}
}
-
- if (errmsg == NULL) {
- /* When 'keymap' is used and 'encoding' changes, reload the keymap
- * (with another encoding). */
- if (varp == &p_enc && *curbuf->b_p_keymap != NUL)
- (void)keymap_init();
- }
} else if (varp == &p_penc) {
/* Canonize printencoding if VIM standard one */
p = enc_canonize(p_penc);
diff --git a/src/nvim/screen.c b/src/nvim/screen.c
index 3e4d016fe7..cee3c62f43 100644
--- a/src/nvim/screen.c
+++ b/src/nvim/screen.c
@@ -5292,7 +5292,7 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)
int force_redraw_next = FALSE;
int need_redraw;
- const int l_has_mbyte = has_mbyte;
+ const bool l_has_mbyte = has_mbyte;
const bool l_enc_utf8 = enc_utf8;
const int l_enc_dbcs = enc_dbcs;
@@ -5459,9 +5459,6 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)
/* If we detected the next character needs to be redrawn, but the text
* doesn't extend up to there, update the character here. */
if (force_redraw_next && col < screen_Columns) {
- if (l_enc_dbcs != 0 && dbcs_off2cells(off, max_off) > 1)
- screen_char_2(off, row, col);
- else
screen_char(off, row, col);
}
}
diff --git a/src/nvim/spell.c b/src/nvim/spell.c
index ba7f31be25..d9cdce8ca4 100644
--- a/src/nvim/spell.c
+++ b/src/nvim/spell.c
@@ -9266,9 +9266,7 @@ static void allcap_copy(char_u *word, char_u *wcopy)
else
c = *s++;
- // We only change 0xdf to SS when we are certain latin1 is used. It
- // would cause weird errors in other 8-bit encodings.
- if (enc_latin1like && c == 0xdf) {
+ if (c == 0xdf) {
c = 'S';
if (d - wcopy >= MAXWLEN - 1)
break;
@@ -12602,7 +12600,7 @@ static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword)
char_u *p;
int wbadword[MAXWLEN];
int wgoodword[MAXWLEN];
- const int l_has_mbyte = has_mbyte;
+ const bool l_has_mbyte = has_mbyte;
if (l_has_mbyte) {
// Get the characters from the multi-byte strings and put them in an
diff --git a/src/nvim/tui/input.c b/src/nvim/tui/input.c
index 740716f0ef..9dc66420b0 100644
--- a/src/nvim/tui/input.c
+++ b/src/nvim/tui/input.c
@@ -31,8 +31,8 @@ void term_input_init(TermInput *input, Loop *loop)
if (!term) {
term = ""; // termkey_new_abstract assumes non-null (#2745)
}
- int enc_flag = enc_utf8 ? TERMKEY_FLAG_UTF8 : TERMKEY_FLAG_RAW;
- input->tk = termkey_new_abstract(term, enc_flag);
+
+ input->tk = termkey_new_abstract(term, TERMKEY_FLAG_UTF8);
int curflags = termkey_get_canonflags(input->tk);
termkey_set_canonflags(input->tk, curflags | TERMKEY_CANON_DELBS);