diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/nvim/charset.c | 16 | ||||
| -rw-r--r-- | src/nvim/fileio.c | 5 | ||||
| -rw-r--r-- | src/nvim/globals.h | 42 | ||||
| -rw-r--r-- | src/nvim/macros.h | 49 | ||||
| -rw-r--r-- | src/nvim/main.c | 1 | ||||
| -rw-r--r-- | src/nvim/mbyte.c | 609 | ||||
| -rw-r--r-- | src/nvim/mbyte.h | 16 | ||||
| -rw-r--r-- | src/nvim/ops.c | 3 | ||||
| -rw-r--r-- | src/nvim/option.c | 44 | ||||
| -rw-r--r-- | src/nvim/screen.c | 5 | ||||
| -rw-r--r-- | src/nvim/spell.c | 6 | ||||
| -rw-r--r-- | src/nvim/tui/input.c | 4 | 
12 files changed, 129 insertions, 671 deletions
| diff --git a/src/nvim/charset.c b/src/nvim/charset.c index 61c5b10808..c501b7e83f 100644 --- a/src/nvim/charset.c +++ b/src/nvim/charset.c @@ -1612,9 +1612,7 @@ bool vim_islower(int c)        return false;      } -    if (enc_latin1like) { -      return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER; -    } +    return (latin1flags[c] & LATIN1LOWER) == LATIN1LOWER;    }    return islower(c);  } @@ -1643,9 +1641,7 @@ bool vim_isupper(int c)        return false;      } -    if (enc_latin1like) { -      return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER; -    } +    return (latin1flags[c] & LATIN1UPPER) == LATIN1UPPER;    }    return isupper(c);  } @@ -1670,9 +1666,7 @@ int vim_toupper(int c)        return c;      } -    if (enc_latin1like) { -      return latin1upper[c]; -    } +    return latin1upper[c];    }    return TOUPPER_LOC(c);  } @@ -1697,9 +1691,7 @@ int vim_tolower(int c)        return c;      } -    if (enc_latin1like) { -      return latin1lower[c]; -    } +    return latin1lower[c];    }    return TOLOWER_LOC(c);  } diff --git a/src/nvim/fileio.c b/src/nvim/fileio.c index c0d4a71b35..0eb475d425 100644 --- a/src/nvim/fileio.c +++ b/src/nvim/fileio.c @@ -4165,9 +4165,8 @@ static bool need_conversion(const char_u *fenc)      same_encoding = (enc_flags != 0 && fenc_flags == enc_flags);    }    if (same_encoding) { -    /* Specified encoding matches with 'encoding'.  This requires -     * conversion when 'encoding' is Unicode but not UTF-8. */ -    return enc_unicode != 0; +    // Specified file encoding matches UTF-8. +    return false;    }    /* Encodings differ.  However, conversion is not needed when 'enc' is any diff --git a/src/nvim/globals.h b/src/nvim/globals.h index 87fb928b30..e42382ad00 100644 --- a/src/nvim/globals.h +++ b/src/nvim/globals.h @@ -778,44 +778,18 @@ EXTERN int vr_lines_changed INIT(= 0);      /* #Lines changed by "gR" so far */  # define DBCS_2BYTE     1       /* 2byte- */  # define DBCS_DEBUG     -1 -EXTERN int enc_dbcs INIT(= 0);                  /* One of DBCS_xxx values if -                                                   DBCS encoding */ -EXTERN int enc_unicode INIT(= 0);       /* 2: UCS-2 or UTF-16, 4: UCS-4 */ -EXTERN bool enc_utf8 INIT(= false);             /* UTF-8 encoded Unicode */ -EXTERN int enc_latin1like INIT(= TRUE);         /* 'encoding' is latin1 comp. */ -EXTERN int has_mbyte INIT(= 0);                 /* any multi-byte encoding */ +// mbyte flags that used to depend on 'encoding'. These are now deprecated, as +// 'encoding' is always "utf-8". Code that use them can be refactored to +// remove dead code. +#define enc_dbcs false +#define enc_utf8 true +#define has_mbyte true  /// Encoding used when 'fencs' is set to "default"  EXTERN char_u *fenc_default INIT(= NULL); -/* - * To speed up BYTELEN() we fill a table with the byte lengths whenever - * enc_utf8 or enc_dbcs changes. - */ -EXTERN char mb_bytelen_tab[256]; - -/* - * Function pointers, used to quickly get to the right function.  Each has - * three possible values: latin_ (8-bit), utfc_ or utf_ (utf-8) and dbcs_ - * (DBCS). - * The value is set in mb_init(); - */ -/* length of char in bytes, including following composing chars */ -EXTERN int (*mb_ptr2len)(const char_u *p) INIT(= latin_ptr2len); -/* idem, with limit on string length */ -EXTERN int (*mb_ptr2len_len)(const char_u *p, int size) INIT(= latin_ptr2len_len); -/* byte length of char */ -EXTERN int (*mb_char2len)(int c) INIT(= latin_char2len); -/* convert char to bytes, return the length */ -EXTERN int (*mb_char2bytes)(int c, char_u *buf) INIT(= latin_char2bytes); -EXTERN int (*mb_ptr2cells)(const char_u *p) INIT(= latin_ptr2cells); -EXTERN int (*mb_ptr2cells_len)(const char_u *p, int size) INIT( -      = latin_ptr2cells_len); -EXTERN int (*mb_char2cells)(int c) INIT(= latin_char2cells); -EXTERN int (*mb_off2cells)(unsigned off, unsigned max_off) INIT( -      = latin_off2cells); -EXTERN int (*mb_ptr2char)(const char_u *p) INIT(= latin_ptr2char); -EXTERN int (*mb_head_off)(const char_u *base, const char_u *p) INIT(= latin_head_off); +// To speed up BYTELEN() we keep a table with the byte lengths for utf-8 +EXTERN char utf8len_tab[256];  # if defined(USE_ICONV) && defined(DYNAMIC_ICONV)  /* Pointers to functions and variables to be loaded at runtime */ diff --git a/src/nvim/macros.h b/src/nvim/macros.h index 503daa9648..79e545771e 100644 --- a/src/nvim/macros.h +++ b/src/nvim/macros.h @@ -122,32 +122,29 @@  /* Whether to draw the vertical bar on the right side of the cell. */  # define CURSOR_BAR_RIGHT (curwin->w_p_rl && (!(State & CMDLINE) || cmdmsg_rl)) -/* - * mb_ptr_adv(): advance a pointer to the next character, taking care of - * multi-byte characters if needed. - * mb_ptr_back(): backup a pointer to the previous character, taking care of - * multi-byte characters if needed. - * MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers. - * PTR2CHAR(): get character from pointer. - */ -/* Get the length of the character p points to */ -# define MB_PTR2LEN(p)          (has_mbyte ? (*mb_ptr2len)(p) : 1) -/* Advance multi-byte pointer, skip over composing chars. */ -# define mb_ptr_adv(p)      (p += has_mbyte ? (*mb_ptr2len)((char_u *)p) : 1) -/* Advance multi-byte pointer, do not skip over composing chars. */ -# define mb_cptr_adv(p)     (p += \ -  enc_utf8 ? utf_ptr2len(p) : has_mbyte ? (*mb_ptr2len)(p) : 1) -/* Backup multi-byte pointer. Only use with "p" > "s" ! */ -# define mb_ptr_back(s, p)  (p -= has_mbyte ? ((*mb_head_off)((char_u *)s, (char_u *)p - 1) + 1) : 1) -/* get length of multi-byte char, not including composing chars */ -# define mb_cptr2len(p)     (enc_utf8 ? utf_ptr2len(p) : (*mb_ptr2len)(p)) - -# define MB_COPY_CHAR(f, t) \ -  if (has_mbyte) mb_copy_char((const char_u **)(&f), &t); \ -  else *t++ = *f++ -# define MB_CHARLEN(p)      (has_mbyte ? mb_charlen(p) : (int)STRLEN(p)) -# define MB_CHAR2LEN(c)     (has_mbyte ? mb_char2len(c) : 1) -# define PTR2CHAR(p)        (has_mbyte ? mb_ptr2char(p) : (int)*(p)) +// mb_ptr_adv(): advance a pointer to the next character, taking care of +// multi-byte characters if needed. +// mb_ptr_back(): backup a pointer to the previous character, taking care of +// multi-byte characters if needed. +// MB_COPY_CHAR(f, t): copy one char from "f" to "t" and advance the pointers. +// PTR2CHAR(): get character from pointer. + +// Get the length of the character p points to +# define MB_PTR2LEN(p)          mb_ptr2len(p) +// Advance multi-byte pointer, skip over composing chars. +# define mb_ptr_adv(p)      (p += mb_ptr2len((char_u *)p)) +// Advance multi-byte pointer, do not skip over composing chars. +# define mb_cptr_adv(p)     (p += utf_ptr2len(p)) +// Backup multi-byte pointer. Only use with "p" > "s" ! +# define mb_ptr_back(s, p)  (p -= mb_head_off((char_u *)s, (char_u *)p - 1) + 1) +// get length of multi-byte char, not including composing chars +# define mb_cptr2len(p)     utf_ptr2len(p) + +# define MB_COPY_CHAR(f, t) mb_copy_char((const char_u **)(&f), &t); + +# define MB_CHARLEN(p)      mb_charlen(p) +# define MB_CHAR2LEN(c)     mb_char2len(c) +# define PTR2CHAR(p)        mb_ptr2char(p)  # define RESET_BINDING(wp)  (wp)->w_p_scb = FALSE; (wp)->w_p_crb = FALSE diff --git a/src/nvim/main.c b/src/nvim/main.c index eb67483d08..ffd9353252 100644 --- a/src/nvim/main.c +++ b/src/nvim/main.c @@ -177,7 +177,6 @@ void early_init(void)    fs_init();    handle_init(); -  (void)mb_init();      // init mb_bytelen_tab[] to ones    eval_init();          // init global variables    // Init the table of Normal mode commands. diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index e6312f9c00..7be0be7106 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -1,68 +1,27 @@ -/* - * mbyte.c: Code specifically for handling multi-byte characters. - * Multibyte extensions partly by Sung-Hoon Baek - * - * The encoding used in the core is set with 'encoding'.  When 'encoding' is - * changed, the following four variables are set (for speed). - * Currently these types of character encodings are supported: - * - * "enc_dbcs"	    When non-zero it tells the type of double byte character - *		    encoding (Chinese, Korean, Japanese, etc.). - *		    The cell width on the display is equal to the number of - *		    bytes.  (exception: DBCS_JPNU with first byte 0x8e) - *		    Recognizing the first or second byte is difficult, it - *		    requires checking a byte sequence from the start. - * "enc_utf8"	    When TRUE use Unicode characters in UTF-8 encoding. - *		    The cell width on the display needs to be determined from - *		    the character value. - *		    Recognizing bytes is easy: 0xxx.xxxx is a single-byte - *		    char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading - *		    byte of a multi-byte character. - *		    To make things complicated, up to six composing characters - *		    are allowed.  These are drawn on top of the first char. - *		    For most editing the sequence of bytes with composing - *		    characters included is considered to be one character. - * "enc_unicode"    When 2 use 16-bit Unicode characters (or UTF-16). - *		    When 4 use 32-but Unicode characters. - *		    Internally characters are stored in UTF-8 encoding to - *		    avoid NUL bytes.  Conversion happens when doing I/O. - *		    "enc_utf8" will also be TRUE. - * - * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero. - * - * If none of these is TRUE, 8-bit bytes are used for a character.  The - * encoding isn't currently specified (TODO). - * - * 'encoding' specifies the encoding used in the core.  This is in registers, - * text manipulation, buffers, etc.  Conversion has to be done when characters - * in another encoding are received or send: - * - *		       clipboard - *			   ^ - *			   | (2) - *			   V - *		   +---------------+ - *	      (1)  |		   | (3) - *  keyboard ----->|	 core	   |-----> display - *		   |		   | - *		   +---------------+ - *			   ^ - *			   | (4) - *			   V - *			 file - * - * (1) Typed characters arrive in the current locale. - * (2) Text will be made available with the encoding specified with - *     'encoding'.  If this is not sufficient, system-specific conversion - *     might be required. - * (3) For the GUI the correct font must be selected, no conversion done. - * (4) The encoding of the file is specified with 'fileencoding'.  Conversion - *     is to be done when it's different from 'encoding'. - * - * The ShaDa file is a special case: Only text is converted, not file names. - * Vim scripts may contain an ":encoding" command.  This has an effect for - * some commands, like ":menutrans" - */ +/// mbyte.c: Code specifically for handling multi-byte characters. +/// Multibyte extensions partly by Sung-Hoon Baek +/// +/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" is +/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is +/// read-only and always reads "utf-8". +/// +/// The cell width on the display needs to be determined from the character +/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char, +/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte +/// character. To make things complicated, up to six composing characters +/// are allowed. These are drawn on top of the first char. For most editing +/// the sequence of bytes with composing characters included is considered to +/// be one character. +/// +/// UTF-8 is used everywhere in the core. This is in registers, text +/// manipulation, buffers, etc. Nvim core communicates with external plugins +/// and GUIs in this encoding. +/// +/// The encoding of a file is specified with 'fileencoding'.  Conversion +/// is to be done when it's different from "utf-8". +/// +/// Vim scripts may contain an ":scriptencoding" command. This has an effect +/// for some commands, like ":menutrans".  #include <inttypes.h>  #include <stdbool.h> @@ -115,7 +74,7 @@ struct interval {   * Bytes which are illegal when used as the first byte have a 1.   * The NUL byte has length 1.   */ -static char utf8len_tab[256] = +char utf8len_tab[256] =  {    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -385,207 +344,6 @@ int enc_canon_props(const char_u *name)  }  /* - * Set up for using multi-byte characters. - * Called in three cases: - * - by main() to initialize (p_enc == NULL) - * - by set_init_1() after 'encoding' was set to its default. - * - by do_set() when 'encoding' has been set. - * p_enc must have been passed through enc_canonize() already. - * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags. - * Fills mb_bytelen_tab[] and returns NULL when there are no problems. - * When there is something wrong: Returns an error message and doesn't change - * anything. - */ -char_u * mb_init(void) -{ -  int i; -  int idx; -  int n; -  int enc_dbcs_new = 0; -#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \ -  && !defined(MACOS) -# define LEN_FROM_CONV -  vimconv_T vimconv; -  char_u      *p; -#endif - -  if (p_enc == NULL) { -    /* Just starting up: set the whole table to one's. */ -    for (i = 0; i < 256; ++i) -      mb_bytelen_tab[i] = 1; -    return NULL; -  } else if (STRNCMP(p_enc, "8bit-", 5) == 0 -      || STRNCMP(p_enc, "iso-8859-", 9) == 0) { -    /* Accept any "8bit-" or "iso-8859-" name. */ -    enc_unicode = 0; -    enc_utf8 = false; -  } else if (STRNCMP(p_enc, "2byte-", 6) == 0) { -    /* Unix: accept any "2byte-" name, assume current locale. */ -    enc_dbcs_new = DBCS_2BYTE; -  } else if ((idx = enc_canon_search(p_enc)) >= 0) { -    i = enc_canon_table[idx].prop; -    if (i & ENC_UNICODE) { -      /* Unicode */ -      enc_utf8 = true; -      if (i & (ENC_2BYTE | ENC_2WORD)) -        enc_unicode = 2; -      else if (i & ENC_4BYTE) -        enc_unicode = 4; -      else -        enc_unicode = 0; -    } else if (i & ENC_DBCS) { -      /* 2byte, handle below */ -      enc_dbcs_new = enc_canon_table[idx].codepage; -    } else { -      /* Must be 8-bit. */ -      enc_unicode = 0; -      enc_utf8 = false; -    } -  } else    /* Don't know what encoding this is, reject it. */ -    return e_invarg; - -  if (enc_dbcs_new != 0) { -    enc_unicode = 0; -    enc_utf8 = false; -  } -  enc_dbcs = enc_dbcs_new; -  has_mbyte = (enc_dbcs != 0 || enc_utf8); - - -  /* Detect an encoding that uses latin1 characters. */ -  enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0 -      || STRCMP(p_enc, "iso-8859-15") == 0); - -  /* -   * Set the function pointers. -   */ -  if (enc_utf8) { -    mb_ptr2len = utfc_ptr2len; -    mb_ptr2len_len = utfc_ptr2len_len; -    mb_char2len = utf_char2len; -    mb_char2bytes = utf_char2bytes; -    mb_ptr2cells = utf_ptr2cells; -    mb_ptr2cells_len = utf_ptr2cells_len; -    mb_char2cells = utf_char2cells; -    mb_off2cells = utf_off2cells; -    mb_ptr2char = utf_ptr2char; -    mb_head_off = utf_head_off; -  } else if (enc_dbcs != 0) { -    mb_ptr2len = dbcs_ptr2len; -    mb_ptr2len_len = dbcs_ptr2len_len; -    mb_char2len = dbcs_char2len; -    mb_char2bytes = dbcs_char2bytes; -    mb_ptr2cells = dbcs_ptr2cells; -    mb_ptr2cells_len = dbcs_ptr2cells_len; -    mb_char2cells = dbcs_char2cells; -    mb_off2cells = dbcs_off2cells; -    mb_ptr2char = dbcs_ptr2char; -    mb_head_off = dbcs_head_off; -  } else { -    mb_ptr2len = latin_ptr2len; -    mb_ptr2len_len = latin_ptr2len_len; -    mb_char2len = latin_char2len; -    mb_char2bytes = latin_char2bytes; -    mb_ptr2cells = latin_ptr2cells; -    mb_ptr2cells_len = latin_ptr2cells_len; -    mb_char2cells = latin_char2cells; -    mb_off2cells = latin_off2cells; -    mb_ptr2char = latin_ptr2char; -    mb_head_off = latin_head_off; -  } - -  /* -   * Fill the mb_bytelen_tab[] for MB_BYTE2LEN(). -   */ -#ifdef LEN_FROM_CONV -  /* When 'encoding' is different from the current locale mblen() won't -   * work.  Use conversion to "utf-8" instead. */ -  vimconv.vc_type = CONV_NONE; -  if (enc_dbcs) { -    p = enc_locale(); -    if (p == NULL || STRCMP(p, p_enc) != 0) { -      convert_setup(&vimconv, p_enc, (char_u *)"utf-8"); -      vimconv.vc_fail = true; -    } -    xfree(p); -  } -#endif - -  for (i = 0; i < 256; ++i) { -    /* Our own function to reliably check the length of UTF-8 characters, -     * independent of mblen(). */ -    if (enc_utf8) -      n = utf8len_tab[i]; -    else if (enc_dbcs == 0) -      n = 1; -    else { -      char buf[MB_MAXBYTES + 1]; -      if (i == NUL)             /* just in case mblen() can't handle "" */ -        n = 1; -      else { -        buf[0] = i; -        buf[1] = 0; -#ifdef LEN_FROM_CONV -        if (vimconv.vc_type != CONV_NONE) { -          /* -           * string_convert() should fail when converting the first -           * byte of a double-byte character. -           */ -          p = string_convert(&vimconv, (char_u *)buf, NULL); -          if (p != NULL) { -            xfree(p); -            n = 1; -          } else -            n = 2; -        } else -#endif -        { -          /* -           * mblen() should return -1 for invalid (means the leading -           * multibyte) character.  However there are some platforms -           * where mblen() returns 0 for invalid character. -           * Therefore, following condition includes 0. -           */ -          ignored = mblen(NULL, 0);             /* First reset the state. */ -          if (mblen(buf, (size_t)1) <= 0) -            n = 2; -          else -            n = 1; -        } -      } -    } -    mb_bytelen_tab[i] = n; -  } - -#ifdef LEN_FROM_CONV -  convert_setup(&vimconv, NULL, NULL); -#endif - -  /* The cell width depends on the type of multi-byte characters. */ -  (void)init_chartab(); - -  /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */ -  screenalloc(false); - -#ifdef HAVE_WORKING_LIBINTL -  /* GNU gettext 0.10.37 supports this feature: set the codeset used for -   * translated messages independently from the current locale. */ -  (void)bind_textdomain_codeset(PROJECT_NAME, -                                enc_utf8 ? "utf-8" : (char *)p_enc); -#endif - - -  /* Fire an autocommand to let people do custom font setup. This must be -   * after Vim has been setup for the new encoding. */ -  apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf); - -  /* Need to reload spell dictionaries */ -  spell_reload(); - -  return NULL; -} - -/*   * Return the size of the BOM for the current buffer:   * 0 - no BOM   * 2 - UCS-2 or UTF-16 BOM @@ -597,20 +355,15 @@ int bomb_size(void)    int n = 0;    if (curbuf->b_p_bomb && !curbuf->b_p_bin) { -    if (*curbuf->b_p_fenc == NUL) { -      if (enc_utf8) { -        if (enc_unicode != 0) -          n = enc_unicode; -        else -          n = 3; -      } -    } else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0) +    if (*curbuf->b_p_fenc == NUL +        || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {        n = 3; -    else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0 -        || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) +    } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0 +               || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {        n = 2; -    else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) +    } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {        n = 4; +    }    }    return n;  } @@ -804,99 +557,6 @@ int dbcs_class(unsigned lead, unsigned trail)  }  /* - * mb_char2len() function pointer. - * Return length in bytes of character "c". - * Returns 1 for a single-byte character. - */ -int latin_char2len(int c) -{ -  return 1; -} - -static int dbcs_char2len(int c) -{ -  if (c >= 0x100) -    return 2; -  return 1; -} - -/* - * mb_char2bytes() function pointer. - * Convert a character to its bytes. - * Returns the length in bytes. - */ -int latin_char2bytes(int c, char_u *buf) -{ -  buf[0] = c; -  return 1; -} - -static int dbcs_char2bytes(int c, char_u *buf) -{ -  if (c >= 0x100) { -    buf[0] = (unsigned)c >> 8; -    buf[1] = c; -    /* Never use a NUL byte, it causes lots of trouble.  It's an invalid -     * character anyway. */ -    if (buf[1] == NUL) -      buf[1] = '\n'; -    return 2; -  } -  buf[0] = c; -  return 1; -} - -/* - * mb_ptr2len() function pointer. - * Get byte length of character at "*p" but stop at a NUL. - * For UTF-8 this includes following composing characters. - * Returns 0 when *p is NUL. - */ -int latin_ptr2len(const char_u *p) -{ -  return MB_BYTE2LEN(*p); -} - -static int dbcs_ptr2len(const char_u *p) -{ -  int len; - -  /* Check if second byte is not missing. */ -  len = MB_BYTE2LEN(*p); -  if (len == 2 && p[1] == NUL) -    len = 1; -  return len; -} - -/* - * mb_ptr2len_len() function pointer. - * Like mb_ptr2len(), but limit to read "size" bytes. - * Returns 0 for an empty string. - * Returns 1 for an illegal char or an incomplete byte sequence. - */ -int latin_ptr2len_len(const char_u *p, int size) -{ -  if (size < 1 || *p == NUL) -    return 0; -  return 1; -} - -static int dbcs_ptr2len_len(const char_u *p, int size) -{ -  int len; - -  if (size < 1 || *p == NUL) -    return 0; -  if (size == 1) -    return 1; -  /* Check that second byte is not missing. */ -  len = MB_BYTE2LEN(*p); -  if (len == 2 && p[1] == NUL) -    len = 1; -  return len; -} - -/*   * Return true if "c" is in "table".   */  static bool intable(const struct interval *table, size_t n_items, int c) @@ -963,16 +623,8 @@ int utf_char2cells(int c)    return 1;  } -/* - * mb_ptr2cells() function pointer. - * Return the number of display cells character at "*p" occupies. - * This doesn't take care of unprintable characters, use ptr2cells() for that. - */ -int latin_ptr2cells(const char_u *p) -{ -  return 1; -} - +/// Return the number of display cells character at "*p" occupies. +/// This doesn't take care of unprintable characters, use ptr2cells() for that.  int utf_ptr2cells(const char_u *p)  {    int c; @@ -991,26 +643,9 @@ int utf_ptr2cells(const char_u *p)    return 1;  } -int dbcs_ptr2cells(const char_u *p) -{ -  /* Number of cells is equal to number of bytes, except for euc-jp when -   * the first byte is 0x8e. */ -  if (enc_dbcs == DBCS_JPNU && *p == 0x8e) -    return 1; -  return MB_BYTE2LEN(*p); -} - -/* - * mb_ptr2cells_len() function pointer. - * Like mb_ptr2cells(), but limit string length to "size". - * For an empty string or truncated character returns 1. - */ -int latin_ptr2cells_len(const char_u *p, int size) -{ -  return 1; -} - -static int utf_ptr2cells_len(const char_u *p, int size) +/// Like utf_ptr2cells(), but limit string length to "size". +/// For an empty string or truncated character returns 1. +int utf_ptr2cells_len(const char_u *p, int size)  {    int c; @@ -1030,35 +665,6 @@ static int utf_ptr2cells_len(const char_u *p, int size)    return 1;  } -static int dbcs_ptr2cells_len(const char_u *p, int size) -{ -  /* Number of cells is equal to number of bytes, except for euc-jp when -   * the first byte is 0x8e. */ -  if (size <= 1 || (enc_dbcs == DBCS_JPNU && *p == 0x8e)) -    return 1; -  return MB_BYTE2LEN(*p); -} - -/* - * mb_char2cells() function pointer. - * Return the number of display cells character "c" occupies. - * Only takes care of multi-byte chars, not "^C" and such. - */ -int latin_char2cells(int c) -{ -  return 1; -} - -static int dbcs_char2cells(int c) -{ -  /* Number of cells is equal to number of bytes, except for euc-jp when -   * the first byte is 0x8e. */ -  if (enc_dbcs == DBCS_JPNU && ((unsigned)c >> 8) == 0x8e) -    return 1; -  /* use the first byte */ -  return MB_BYTE2LEN((unsigned)c >> 8); -} -  /// Calculate the number of cells occupied by string `str`.  ///  /// @param str The source string, may not be NULL, must be a NUL-terminated @@ -1075,51 +681,14 @@ size_t mb_string2cells(const char_u *str)    return clen;  } -/* - * mb_off2cells() function pointer. - * Return number of display cells for char at ScreenLines[off]. - * We make sure that the offset used is less than "max_off". - */ -int latin_off2cells(unsigned off, unsigned max_off) -{ -  return 1; -} - -int dbcs_off2cells(unsigned off, unsigned max_off) -{ -  /* never check beyond end of the line */ -  if (off >= max_off) -    return 1; - -  /* Number of cells is equal to number of bytes, except for euc-jp when -   * the first byte is 0x8e. */ -  if (enc_dbcs == DBCS_JPNU && ScreenLines[off] == 0x8e) -    return 1; -  return MB_BYTE2LEN(ScreenLines[off]); -} - +/// Return number of display cells for char at ScreenLines[off]. +/// We make sure that the offset used is less than "max_off".  int utf_off2cells(unsigned off, unsigned max_off)  {    return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1;  }  /* - * mb_ptr2char() function pointer. - * Convert a byte sequence into a character. - */ -int latin_ptr2char(const char_u *p) -{ -  return *p; -} - -static int dbcs_ptr2char(const char_u *p) -{ -  if (MB_BYTE2LEN(*p) > 1 && p[1] != NUL) -    return (p[0] << 8) + p[1]; -  return *p; -} - -/*   * Convert a UTF-8 byte sequence to a wide character.   * If the sequence is illegal or truncated by a NUL the first byte is   * returned. @@ -2065,68 +1634,9 @@ void show_utf8(void)    msg(IObuff);  } -/* - * mb_head_off() function pointer. - * Return offset from "p" to the first byte of the character it points into. - * If "p" points to the NUL at the end of the string return 0. - * Returns 0 when already at the first byte of a character. - */ -int latin_head_off(const char_u *base, const char_u *p) -{ -  return 0; -} - -int dbcs_head_off(const char_u *base, const char_u *p) -{ -  /* It can't be a trailing byte when not using DBCS, at the start of the -   * string or the previous byte can't start a double-byte. */ -  if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) { -    return 0; -  } - -  /* This is slow: need to start at the base and go forward until the -   * byte we are looking for.  Return 1 when we went past it, 0 otherwise. */ -  const char_u *q = base; -  while (q < p) { -    q += dbcs_ptr2len(q); -  } - -  return (q == p) ? 0 : 1; -} - -/* - * Special version of dbcs_head_off() that works for ScreenLines[], where - * single-width DBCS_JPNU characters are stored separately. - */ -int dbcs_screen_head_off(const char_u *base, const char_u *p) -{ -  /* It can't be a trailing byte when not using DBCS, at the start of the -   * string or the previous byte can't start a double-byte. -   * For euc-jp an 0x8e byte in the previous cell always means we have a -   * lead byte in the current cell. */ -  if (p <= base -      || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e) -      || MB_BYTE2LEN(p[-1]) == 1 -      || *p == NUL) -    return 0; - -  /* This is slow: need to start at the base and go forward until the -   * byte we are looking for.  Return 1 when we went past it, 0 otherwise. -   * For DBCS_JPNU look out for 0x8e, which means the second byte is not -   * stored as the next byte. */ -  const char_u *q = base; -  while (q < p) { -    if (enc_dbcs == DBCS_JPNU && *q == 0x8e) { -      ++q; -    } -    else { -      q += dbcs_ptr2len(q); -    } -  } - -  return (q == p) ? 0 : 1; -} - +/// Return offset from "p" to the first byte of the character it points into. +/// If "p" points to the NUL at the end of the string return 0. +/// Returns 0 when already at the first byte of a character.  int utf_head_off(const char_u *base, const char_u *p)  {    int c; @@ -2232,26 +1742,20 @@ int mb_tail_off(char_u *base, char_u *p)    if (*p == NUL)      return 0; -  if (enc_utf8) { -    /* Find the last character that is 10xx.xxxx */ -    for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i) -      ; -    /* Check for illegal sequence. */ -    for (j = 0; p - j > base; ++j) -      if ((p[-j] & 0xc0) != 0x80) -        break; -    if (utf8len_tab[p[-j]] != i + j + 1) -      return 0; -    return i; +  // Find the last character that is 10xx.xxxx +  for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} + +  // Check for illegal sequence. +  for (j = 0; p - j > base; j++) { +    if ((p[-j] & 0xc0) != 0x80) { +      break; +    }    } -  /* It can't be the first byte if a double-byte when not using DBCS, at the -   * end of the string or the byte can't start a double-byte. */ -  if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1) +  if (utf8len_tab[p[-j]] != i + j + 1) {      return 0; - -  /* Return 1 when on the lead byte, 0 when on the tail byte. */ -  return 1 - dbcs_head_off(base, p); +  } +  return i;  }  /* @@ -2466,13 +1970,10 @@ int mb_fix_col(int col, int row)  {    col = check_col(col);    row = check_row(row); -  if (has_mbyte && ScreenLines != NULL && col > 0 -      && ((enc_dbcs -          && ScreenLines[LineOffset[row] + col] != NUL -          && dbcs_screen_head_off(ScreenLines + LineOffset[row], -            ScreenLines + LineOffset[row] + col)) -        || (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0))) +  if (ScreenLines != NULL && col > 0 +      && ScreenLines[LineOffset[row] + col] == 0) {      return col - 1; +  }    return col;  } diff --git a/src/nvim/mbyte.h b/src/nvim/mbyte.h index 0cfe2c4bab..2c92a0fbb2 100644 --- a/src/nvim/mbyte.h +++ b/src/nvim/mbyte.h @@ -9,8 +9,8 @@   * MB_BYTE2LEN_CHECK() can be used to count a special key as one byte.   * Don't call MB_BYTE2LEN(b) with b < 0 or b > 255!   */ -#define MB_BYTE2LEN(b)         mb_bytelen_tab[b] -#define MB_BYTE2LEN_CHECK(b)   (((b) < 0 || (b) > 255) ? 1 : mb_bytelen_tab[b]) +#define MB_BYTE2LEN(b)         utf8len_tab[b] +#define MB_BYTE2LEN_CHECK(b)   (((b) < 0 || (b) > 255) ? 1 : utf8len_tab[b])  /* properties used in enc_canon_table[] (first three mutually exclusive) */  #define ENC_8BIT       0x01 @@ -28,6 +28,18 @@  #define ENC_LATIN9     0x400       /* Latin9 */  #define ENC_MACROMAN   0x800       /* Mac Roman (not Macro Man! :-) */ +// TODO(bfredl): eventually we should keep only one of the namings +#define mb_ptr2len utfc_ptr2len +#define mb_ptr2len_len utfc_ptr2len_len +#define mb_char2len utf_char2len +#define mb_char2bytes utf_char2bytes +#define mb_ptr2cells utf_ptr2cells +#define mb_ptr2cells_len utf_ptr2cells_len +#define mb_char2cells utf_char2cells +#define mb_off2cells utf_off2cells +#define mb_ptr2char utf_ptr2char +#define mb_head_off utf_head_off +  #ifdef INCLUDE_GENERATED_DECLARATIONS  # include "mbyte.h.generated.h"  #endif diff --git a/src/nvim/ops.c b/src/nvim/ops.c index 388a72adce..0263bd15da 100644 --- a/src/nvim/ops.c +++ b/src/nvim/ops.c @@ -1936,8 +1936,7 @@ int swapchar(int op_type, pos_T *pos)    if (c >= 0x80 && op_type == OP_ROT13)      return FALSE; -  if (op_type == OP_UPPER && c == 0xdf -      && (enc_latin1like || STRCMP(p_enc, "iso-8859-2") == 0)) { +  if (op_type == OP_UPPER && c == 0xdf) {      pos_T sp = curwin->w_cursor;      /* Special handling of German sharp s: change to "SS". */ diff --git a/src/nvim/option.c b/src/nvim/option.c index a255165e32..311982982b 100644 --- a/src/nvim/option.c +++ b/src/nvim/option.c @@ -780,14 +780,11 @@ void set_init_1(void)    }    fenc_default = p; -  // Initialize multibyte (utf-8) handling -  mb_init(); - -  // Don't change &encoding when resetting to defaults with ":set all&". -  opt_idx = findoption((char_u *)"encoding"); -  if (opt_idx >= 0) { -    options[opt_idx].flags |= P_NODEFAULT; -  } +#ifdef HAVE_WORKING_LIBINTL +  // GNU gettext 0.10.37 supports this feature: set the codeset used for +  // translated messages independently from the current locale. +  (void)bind_textdomain_codeset(PROJECT_NAME, (char *)p_enc); +#endif    /* Set the default for 'helplang'. */    set_helplang_default(get_mess_lang()); @@ -2580,19 +2577,17 @@ did_set_string_option (        errmsg = e_invarg;    /* 'encoding' and 'fileencoding' */    } else if (varp == &p_enc || gvarp == &p_fenc) { -    if (varp == &p_enc && did_source_startup_scripts) { -       errmsg = e_afterinit; -    } else if (gvarp == &p_fenc) { -      if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL) +    if (gvarp == &p_fenc) { +      if (!MODIFIABLE(curbuf) && opt_flags != OPT_GLOBAL) {          errmsg = e_modifiable; -      else if (vim_strchr(*varp, ',') != NULL) -        /* No comma allowed in 'fileencoding'; catches confusing it -         * with 'fileencodings'. */ +      } else if (vim_strchr(*varp, ',') != NULL) { +        // No comma allowed in 'fileencoding'; catches confusing it +        // with 'fileencodings'.          errmsg = e_invarg; -      else { -        /* May show a "+" in the title now. */ +      } else { +        // May show a "+" in the title now.          redraw_titles(); -        /* Add 'fileencoding' to the swap file. */ +        // Add 'fileencoding' to the swap file.          ml_setflags(curbuf);        }      } @@ -2603,17 +2598,12 @@ did_set_string_option (        xfree(*varp);        *varp = p;        if (varp == &p_enc) { -        errmsg = mb_init(); -        redraw_titles(); +        // only encoding=utf-8 allowed +        if (STRCMP(p_enc, "utf-8") != 0) { +          errmsg = e_invarg; +        }        }      } - -    if (errmsg == NULL) { -      /* When 'keymap' is used and 'encoding' changes, reload the keymap -       * (with another encoding). */ -      if (varp == &p_enc && *curbuf->b_p_keymap != NUL) -        (void)keymap_init(); -    }    } else if (varp == &p_penc) {      /* Canonize printencoding if VIM standard one */      p = enc_canonize(p_penc); diff --git a/src/nvim/screen.c b/src/nvim/screen.c index 3e4d016fe7..cee3c62f43 100644 --- a/src/nvim/screen.c +++ b/src/nvim/screen.c @@ -5292,7 +5292,7 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)    int force_redraw_next = FALSE;    int need_redraw; -  const int l_has_mbyte = has_mbyte; +  const bool l_has_mbyte = has_mbyte;    const bool l_enc_utf8 = enc_utf8;    const int l_enc_dbcs = enc_dbcs; @@ -5459,9 +5459,6 @@ void screen_puts_len(char_u *text, int textlen, int row, int col, int attr)    /* If we detected the next character needs to be redrawn, but the text     * doesn't extend up to there, update the character here. */    if (force_redraw_next && col < screen_Columns) { -    if (l_enc_dbcs != 0 && dbcs_off2cells(off, max_off) > 1) -      screen_char_2(off, row, col); -    else        screen_char(off, row, col);    }  } diff --git a/src/nvim/spell.c b/src/nvim/spell.c index ba7f31be25..d9cdce8ca4 100644 --- a/src/nvim/spell.c +++ b/src/nvim/spell.c @@ -9266,9 +9266,7 @@ static void allcap_copy(char_u *word, char_u *wcopy)      else        c = *s++; -    // We only change 0xdf to SS when we are certain latin1 is used.  It -    // would cause weird errors in other 8-bit encodings. -    if (enc_latin1like && c == 0xdf) { +    if (c == 0xdf) {        c = 'S';        if (d - wcopy >= MAXWLEN - 1)          break; @@ -12602,7 +12600,7 @@ static int spell_edit_score(slang_T *slang, char_u *badword, char_u *goodword)    char_u      *p;    int wbadword[MAXWLEN];    int wgoodword[MAXWLEN]; -  const int l_has_mbyte = has_mbyte; +  const bool l_has_mbyte = has_mbyte;    if (l_has_mbyte) {      // Get the characters from the multi-byte strings and put them in an diff --git a/src/nvim/tui/input.c b/src/nvim/tui/input.c index 740716f0ef..9dc66420b0 100644 --- a/src/nvim/tui/input.c +++ b/src/nvim/tui/input.c @@ -31,8 +31,8 @@ void term_input_init(TermInput *input, Loop *loop)    if (!term) {      term = "";  // termkey_new_abstract assumes non-null (#2745)    } -  int enc_flag = enc_utf8 ? TERMKEY_FLAG_UTF8 : TERMKEY_FLAG_RAW; -  input->tk = termkey_new_abstract(term, enc_flag); + +  input->tk = termkey_new_abstract(term, TERMKEY_FLAG_UTF8);    int curflags = termkey_get_canonflags(input->tk);    termkey_set_canonflags(input->tk, curflags | TERMKEY_CANON_DELBS); | 
