Merge pull request #2905 from bfredl/utf8

Only allow encoding=utf-8 and simplify multibyte code
author: Björn Linse <bjorn.linse@gmail.com> 2016-11-05 18:12:14 +0100
committer: GitHub <noreply@github.com> 2016-11-05 18:12:14 +0100
commit: 9147331e212e8d0fff0f30344007faffd42609ee (patch)
tree: 6d1853a72b94ce19ba0cc163c664a9a206da0091 /src/nvim/mbyte.c
parent: 32d9c19e294f38a6adae6e055fc606fc3fd33f2f (diff)
parent: 4ab3fe8eaadb5456eeafc49df2fb0ecf71d836cc (diff)
download: rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.gz
rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.bz2
rneovim-9147331e212e8d0fff0f30344007faffd42609ee.zip
1 files changed, 55 insertions, 554 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index e6312f9c00..7be0be7106 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -1,68 +1,27 @@
-/*
- * mbyte.c: Code specifically for handling multi-byte characters.
- * Multibyte extensions partly by Sung-Hoon Baek
- *
- * The encoding used in the core is set with 'encoding'.  When 'encoding' is
- * changed, the following four variables are set (for speed).
- * Currently these types of character encodings are supported:
- *
- * "enc_dbcs"	    When non-zero it tells the type of double byte character
- *		    encoding (Chinese, Korean, Japanese, etc.).
- *		    The cell width on the display is equal to the number of
- *		    bytes.  (exception: DBCS_JPNU with first byte 0x8e)
- *		    Recognizing the first or second byte is difficult, it
- *		    requires checking a byte sequence from the start.
- * "enc_utf8"	    When TRUE use Unicode characters in UTF-8 encoding.
- *		    The cell width on the display needs to be determined from
- *		    the character value.
- *		    Recognizing bytes is easy: 0xxx.xxxx is a single-byte
- *		    char, 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading
- *		    byte of a multi-byte character.
- *		    To make things complicated, up to six composing characters
- *		    are allowed.  These are drawn on top of the first char.
- *		    For most editing the sequence of bytes with composing
- *		    characters included is considered to be one character.
- * "enc_unicode"    When 2 use 16-bit Unicode characters (or UTF-16).
- *		    When 4 use 32-but Unicode characters.
- *		    Internally characters are stored in UTF-8 encoding to
- *		    avoid NUL bytes.  Conversion happens when doing I/O.
- *		    "enc_utf8" will also be TRUE.
- *
- * "has_mbyte" is set when "enc_dbcs" or "enc_utf8" is non-zero.
- *
- * If none of these is TRUE, 8-bit bytes are used for a character.  The
- * encoding isn't currently specified (TODO).
- *
- * 'encoding' specifies the encoding used in the core.  This is in registers,
- * text manipulation, buffers, etc.  Conversion has to be done when characters
- * in another encoding are received or send:
- *
- *		       clipboard
- *			   ^
- *			   | (2)
- *			   V
- *		   +---------------+
- *	      (1)  |		   | (3)
- *  keyboard ----->|	 core	   |-----> display
- *		   |		   |
- *		   +---------------+
- *			   ^
- *			   | (4)
- *			   V
- *			 file
- *
- * (1) Typed characters arrive in the current locale.
- * (2) Text will be made available with the encoding specified with
- *     'encoding'.  If this is not sufficient, system-specific conversion
- *     might be required.
- * (3) For the GUI the correct font must be selected, no conversion done.
- * (4) The encoding of the file is specified with 'fileencoding'.  Conversion
- *     is to be done when it's different from 'encoding'.
- *
- * The ShaDa file is a special case: Only text is converted, not file names.
- * Vim scripts may contain an ":encoding" command.  This has an effect for
- * some commands, like ":menutrans"
- */
+/// mbyte.c: Code specifically for handling multi-byte characters.
+/// Multibyte extensions partly by Sung-Hoon Baek
+///
+/// The encoding used in nvim is always UTF-8. "enc_utf8" and "has_mbyte" are
+/// thus always true. "enc_dbcs" is always zero. The 'encoding' option is
+/// read-only and always reads "utf-8".
+///
+/// The cell width on the display needs to be determined from the character
+/// value. Recognizing UTF-8 bytes is easy: 0xxx.xxxx is a single-byte char,
+/// 10xx.xxxx is a trailing byte, 11xx.xxxx is a leading byte of a multi-byte
+/// character. To make things complicated, up to six composing characters
+/// are allowed. These are drawn on top of the first char. For most editing
+/// the sequence of bytes with composing characters included is considered to
+/// be one character.
+///
+/// UTF-8 is used everywhere in the core. This is in registers, text
+/// manipulation, buffers, etc. Nvim core communicates with external plugins
+/// and GUIs in this encoding.
+///
+/// The encoding of a file is specified with 'fileencoding'.  Conversion
+/// is to be done when it's different from "utf-8".
+///
+/// Vim scripts may contain a ":scriptencoding" command. This has an effect
+/// for some commands, like ":menutrans".
 
 #include <inttypes.h>
 #include <stdbool.h>
@@ -115,7 +74,7 @@ struct interval {
  * Bytes which are illegal when used as the first byte have a 1.
  * The NUL byte has length 1.
  */
-static char utf8len_tab[256] =
+char utf8len_tab[256] =
 {
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -385,207 +344,6 @@ int enc_canon_props(const char_u *name)
 }
 
 /*
- * Set up for using multi-byte characters.
- * Called in three cases:
- * - by main() to initialize (p_enc == NULL)
- * - by set_init_1() after 'encoding' was set to its default.
- * - by do_set() when 'encoding' has been set.
- * p_enc must have been passed through enc_canonize() already.
- * Sets the "enc_unicode", "enc_utf8", "enc_dbcs" and "has_mbyte" flags.
- * Fills mb_bytelen_tab[] and returns NULL when there are no problems.
- * When there is something wrong: Returns an error message and doesn't change
- * anything.
- */
-char_u * mb_init(void)
-{
-  int i;
-  int idx;
-  int n;
-  int enc_dbcs_new = 0;
-#if defined(USE_ICONV) && !defined(WIN3264) && !defined(WIN32UNIX) \
-  && !defined(MACOS)
-# define LEN_FROM_CONV
-  vimconv_T vimconv;
-  char_u      *p;
-#endif
-
-  if (p_enc == NULL) {
-    /* Just starting up: set the whole table to one's. */
-    for (i = 0; i < 256; ++i)
-      mb_bytelen_tab[i] = 1;
-    return NULL;
-  } else if (STRNCMP(p_enc, "8bit-", 5) == 0
-      || STRNCMP(p_enc, "iso-8859-", 9) == 0) {
-    /* Accept any "8bit-" or "iso-8859-" name. */
-    enc_unicode = 0;
-    enc_utf8 = false;
-  } else if (STRNCMP(p_enc, "2byte-", 6) == 0) {
-    /* Unix: accept any "2byte-" name, assume current locale. */
-    enc_dbcs_new = DBCS_2BYTE;
-  } else if ((idx = enc_canon_search(p_enc)) >= 0) {
-    i = enc_canon_table[idx].prop;
-    if (i & ENC_UNICODE) {
-      /* Unicode */
-      enc_utf8 = true;
-      if (i & (ENC_2BYTE | ENC_2WORD))
-        enc_unicode = 2;
-      else if (i & ENC_4BYTE)
-        enc_unicode = 4;
-      else
-        enc_unicode = 0;
-    } else if (i & ENC_DBCS) {
-      /* 2byte, handle below */
-      enc_dbcs_new = enc_canon_table[idx].codepage;
-    } else {
-      /* Must be 8-bit. */
-      enc_unicode = 0;
-      enc_utf8 = false;
-    }
-  } else    /* Don't know what encoding this is, reject it. */
-    return e_invarg;
-
-  if (enc_dbcs_new != 0) {
-    enc_unicode = 0;
-    enc_utf8 = false;
-  }
-  enc_dbcs = enc_dbcs_new;
-  has_mbyte = (enc_dbcs != 0 || enc_utf8);
-
-
-  /* Detect an encoding that uses latin1 characters. */
-  enc_latin1like = (enc_utf8 || STRCMP(p_enc, "latin1") == 0
-      || STRCMP(p_enc, "iso-8859-15") == 0);
-
-  /*
-   * Set the function pointers.
-   */
-  if (enc_utf8) {
-    mb_ptr2len = utfc_ptr2len;
-    mb_ptr2len_len = utfc_ptr2len_len;
-    mb_char2len = utf_char2len;
-    mb_char2bytes = utf_char2bytes;
-    mb_ptr2cells = utf_ptr2cells;
-    mb_ptr2cells_len = utf_ptr2cells_len;
-    mb_char2cells = utf_char2cells;
-    mb_off2cells = utf_off2cells;
-    mb_ptr2char = utf_ptr2char;
-    mb_head_off = utf_head_off;
-  } else if (enc_dbcs != 0) {
-    mb_ptr2len = dbcs_ptr2len;
-    mb_ptr2len_len = dbcs_ptr2len_len;
-    mb_char2len = dbcs_char2len;
-    mb_char2bytes = dbcs_char2bytes;
-    mb_ptr2cells = dbcs_ptr2cells;
-    mb_ptr2cells_len = dbcs_ptr2cells_len;
-    mb_char2cells = dbcs_char2cells;
-    mb_off2cells = dbcs_off2cells;
-    mb_ptr2char = dbcs_ptr2char;
-    mb_head_off = dbcs_head_off;
-  } else {
-    mb_ptr2len = latin_ptr2len;
-    mb_ptr2len_len = latin_ptr2len_len;
-    mb_char2len = latin_char2len;
-    mb_char2bytes = latin_char2bytes;
-    mb_ptr2cells = latin_ptr2cells;
-    mb_ptr2cells_len = latin_ptr2cells_len;
-    mb_char2cells = latin_char2cells;
-    mb_off2cells = latin_off2cells;
-    mb_ptr2char = latin_ptr2char;
-    mb_head_off = latin_head_off;
-  }
-
-  /*
-   * Fill the mb_bytelen_tab[] for MB_BYTE2LEN().
-   */
-#ifdef LEN_FROM_CONV
-  /* When 'encoding' is different from the current locale mblen() won't
-   * work.  Use conversion to "utf-8" instead. */
-  vimconv.vc_type = CONV_NONE;
-  if (enc_dbcs) {
-    p = enc_locale();
-    if (p == NULL || STRCMP(p, p_enc) != 0) {
-      convert_setup(&vimconv, p_enc, (char_u *)"utf-8");
-      vimconv.vc_fail = true;
-    }
-    xfree(p);
-  }
-#endif
-
-  for (i = 0; i < 256; ++i) {
-    /* Our own function to reliably check the length of UTF-8 characters,
-     * independent of mblen(). */
-    if (enc_utf8)
-      n = utf8len_tab[i];
-    else if (enc_dbcs == 0)
-      n = 1;
-    else {
-      char buf[MB_MAXBYTES + 1];
-      if (i == NUL)             /* just in case mblen() can't handle "" */
-        n = 1;
-      else {
-        buf[0] = i;
-        buf[1] = 0;
-#ifdef LEN_FROM_CONV
-        if (vimconv.vc_type != CONV_NONE) {
-          /*
-           * string_convert() should fail when converting the first
-           * byte of a double-byte character.
-           */
-          p = string_convert(&vimconv, (char_u *)buf, NULL);
-          if (p != NULL) {
-            xfree(p);
-            n = 1;
-          } else
-            n = 2;
-        } else
-#endif
-        {
-          /*
-           * mblen() should return -1 for invalid (means the leading
-           * multibyte) character.  However there are some platforms
-           * where mblen() returns 0 for invalid character.
-           * Therefore, following condition includes 0.
-           */
-          ignored = mblen(NULL, 0);             /* First reset the state. */
-          if (mblen(buf, (size_t)1) <= 0)
-            n = 2;
-          else
-            n = 1;
-        }
-      }
-    }
-    mb_bytelen_tab[i] = n;
-  }
-
-#ifdef LEN_FROM_CONV
-  convert_setup(&vimconv, NULL, NULL);
-#endif
-
-  /* The cell width depends on the type of multi-byte characters. */
-  (void)init_chartab();
-
-  /* When enc_utf8 is set or reset, (de)allocate ScreenLinesUC[] */
-  screenalloc(false);
-
-#ifdef HAVE_WORKING_LIBINTL
-  /* GNU gettext 0.10.37 supports this feature: set the codeset used for
-   * translated messages independently from the current locale. */
-  (void)bind_textdomain_codeset(PROJECT_NAME,
-                                enc_utf8 ? "utf-8" : (char *)p_enc);
-#endif
-
-
-  /* Fire an autocommand to let people do custom font setup. This must be
-   * after Vim has been setup for the new encoding. */
-  apply_autocmds(EVENT_ENCODINGCHANGED, NULL, (char_u *)"", FALSE, curbuf);
-
-  /* Need to reload spell dictionaries */
-  spell_reload();
-
-  return NULL;
-}
-
-/*
  * Return the size of the BOM for the current buffer:
  * 0 - no BOM
  * 2 - UCS-2 or UTF-16 BOM
@@ -597,20 +355,15 @@ int bomb_size(void)
   int n = 0;
 
   if (curbuf->b_p_bomb && !curbuf->b_p_bin) {
-    if (*curbuf->b_p_fenc == NUL) {
-      if (enc_utf8) {
-        if (enc_unicode != 0)
-          n = enc_unicode;
-        else
-          n = 3;
-      }
-    } else if (STRCMP(curbuf->b_p_fenc, "utf-8") == 0)
+    if (*curbuf->b_p_fenc == NUL
+        || STRCMP(curbuf->b_p_fenc, "utf-8") == 0) {
       n = 3;
-    else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
-        || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0)
+    } else if (STRNCMP(curbuf->b_p_fenc, "ucs-2", 5) == 0
+               || STRNCMP(curbuf->b_p_fenc, "utf-16", 6) == 0) {
       n = 2;
-    else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0)
+    } else if (STRNCMP(curbuf->b_p_fenc, "ucs-4", 5) == 0) {
       n = 4;
+    }
   }
   return n;
 }
@@ -804,99 +557,6 @@ int dbcs_class(unsigned lead, unsigned trail)
 }
 
 /*
- * mb_char2len() function pointer.
- * Return length in bytes of character "c".
- * Returns 1 for a single-byte character.
- */
-int latin_char2len(int c)
-{
-  return 1;
-}
-
-static int dbcs_char2len(int c)
-{
-  if (c >= 0x100)
-    return 2;
-  return 1;
-}
-
-/*
- * mb_char2bytes() function pointer.
- * Convert a character to its bytes.
- * Returns the length in bytes.
- */
-int latin_char2bytes(int c, char_u *buf)
-{
-  buf[0] = c;
-  return 1;
-}
-
-static int dbcs_char2bytes(int c, char_u *buf)
-{
-  if (c >= 0x100) {
-    buf[0] = (unsigned)c >> 8;
-    buf[1] = c;
-    /* Never use a NUL byte, it causes lots of trouble.  It's an invalid
-     * character anyway. */
-    if (buf[1] == NUL)
-      buf[1] = '\n';
-    return 2;
-  }
-  buf[0] = c;
-  return 1;
-}
-
-/*
- * mb_ptr2len() function pointer.
- * Get byte length of character at "*p" but stop at a NUL.
- * For UTF-8 this includes following composing characters.
- * Returns 0 when *p is NUL.
- */
-int latin_ptr2len(const char_u *p)
-{
-  return MB_BYTE2LEN(*p);
-}
-
-static int dbcs_ptr2len(const char_u *p)
-{
-  int len;
-
-  /* Check if second byte is not missing. */
-  len = MB_BYTE2LEN(*p);
-  if (len == 2 && p[1] == NUL)
-    len = 1;
-  return len;
-}
-
-/*
- * mb_ptr2len_len() function pointer.
- * Like mb_ptr2len(), but limit to read "size" bytes.
- * Returns 0 for an empty string.
- * Returns 1 for an illegal char or an incomplete byte sequence.
- */
-int latin_ptr2len_len(const char_u *p, int size)
-{
-  if (size < 1 || *p == NUL)
-    return 0;
-  return 1;
-}
-
-static int dbcs_ptr2len_len(const char_u *p, int size)
-{
-  int len;
-
-  if (size < 1 || *p == NUL)
-    return 0;
-  if (size == 1)
-    return 1;
-  /* Check that second byte is not missing. */
-  len = MB_BYTE2LEN(*p);
-  if (len == 2 && p[1] == NUL)
-    len = 1;
-  return len;
-}
-
-/*
  * Return true if "c" is in "table".
  */
 static bool intable(const struct interval *table, size_t n_items, int c)
@@ -963,16 +623,8 @@ int utf_char2cells(int c)
   return 1;
 }
 
-/*
- * mb_ptr2cells() function pointer.
- * Return the number of display cells character at "*p" occupies.
- * This doesn't take care of unprintable characters, use ptr2cells() for that.
- */
-int latin_ptr2cells(const char_u *p)
-{
-  return 1;
-}
-
+/// Return the number of display cells character at "*p" occupies.
+/// This doesn't take care of unprintable characters, use ptr2cells() for that.
 int utf_ptr2cells(const char_u *p)
 {
   int c;
@@ -991,26 +643,9 @@ int utf_ptr2cells(const char_u *p)
   return 1;
 }
 
-int dbcs_ptr2cells(const char_u *p)
-{
-  /* Number of cells is equal to number of bytes, except for euc-jp when
-   * the first byte is 0x8e. */
-  if (enc_dbcs == DBCS_JPNU && *p == 0x8e)
-    return 1;
-  return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_ptr2cells_len() function pointer.
- * Like mb_ptr2cells(), but limit string length to "size".
- * For an empty string or truncated character returns 1.
- */
-int latin_ptr2cells_len(const char_u *p, int size)
-{
-  return 1;
-}
-
-static int utf_ptr2cells_len(const char_u *p, int size)
+/// Like utf_ptr2cells(), but limit string length to "size".
+/// For an empty string or truncated character returns 1.
+int utf_ptr2cells_len(const char_u *p, int size)
 {
   int c;
 
@@ -1030,35 +665,6 @@ static int utf_ptr2cells_len(const char_u *p, int size)
   return 1;
 }
 
-static int dbcs_ptr2cells_len(const char_u *p, int size)
-{
-  /* Number of cells is equal to number of bytes, except for euc-jp when
-   * the first byte is 0x8e. */
-  if (size <= 1 || (enc_dbcs == DBCS_JPNU && *p == 0x8e))
-    return 1;
-  return MB_BYTE2LEN(*p);
-}
-
-/*
- * mb_char2cells() function pointer.
- * Return the number of display cells character "c" occupies.
- * Only takes care of multi-byte chars, not "^C" and such.
- */
-int latin_char2cells(int c)
-{
-  return 1;
-}
-
-static int dbcs_char2cells(int c)
-{
-  /* Number of cells is equal to number of bytes, except for euc-jp when
-   * the first byte is 0x8e. */
-  if (enc_dbcs == DBCS_JPNU && ((unsigned)c >> 8) == 0x8e)
-    return 1;
-  /* use the first byte */
-  return MB_BYTE2LEN((unsigned)c >> 8);
-}
-
 /// Calculate the number of cells occupied by string `str`.
 ///
 /// @param str The source string, may not be NULL, must be a NUL-terminated
@@ -1075,51 +681,14 @@ size_t mb_string2cells(const char_u *str)
   return clen;
 }
 
-/*
- * mb_off2cells() function pointer.
- * Return number of display cells for char at ScreenLines[off].
- * We make sure that the offset used is less than "max_off".
- */
-int latin_off2cells(unsigned off, unsigned max_off)
-{
-  return 1;
-}
-
-int dbcs_off2cells(unsigned off, unsigned max_off)
-{
-  /* never check beyond end of the line */
-  if (off >= max_off)
-    return 1;
-
-  /* Number of cells is equal to number of bytes, except for euc-jp when
-   * the first byte is 0x8e. */
-  if (enc_dbcs == DBCS_JPNU && ScreenLines[off] == 0x8e)
-    return 1;
-  return MB_BYTE2LEN(ScreenLines[off]);
-}
-
+/// Return number of display cells for char at ScreenLines[off]: 2 when the
+/// next cell, ScreenLines[off + 1], is below "max_off" and holds 0, else 1.
 int utf_off2cells(unsigned off, unsigned max_off)
 {
   return (off + 1 < max_off && ScreenLines[off + 1] == 0) ? 2 : 1;
 }
 
 /*
- * mb_ptr2char() function pointer.
- * Convert a byte sequence into a character.
- */
-int latin_ptr2char(const char_u *p)
-{
-  return *p;
-}
-
-static int dbcs_ptr2char(const char_u *p)
-{
-  if (MB_BYTE2LEN(*p) > 1 && p[1] != NUL)
-    return (p[0] << 8) + p[1];
-  return *p;
-}
-
-/*
  * Convert a UTF-8 byte sequence to a wide character.
  * If the sequence is illegal or truncated by a NUL the first byte is
  * returned.
@@ -2065,68 +1634,9 @@ void show_utf8(void)
   msg(IObuff);
 }
 
-/*
- * mb_head_off() function pointer.
- * Return offset from "p" to the first byte of the character it points into.
- * If "p" points to the NUL at the end of the string return 0.
- * Returns 0 when already at the first byte of a character.
- */
-int latin_head_off(const char_u *base, const char_u *p)
-{
-  return 0;
-}
-
-int dbcs_head_off(const char_u *base, const char_u *p)
-{
-  /* It can't be a trailing byte when not using DBCS, at the start of the
-   * string or the previous byte can't start a double-byte. */
-  if (p <= base || MB_BYTE2LEN(p[-1]) == 1 || *p == NUL) {
-    return 0;
-  }
-
-  /* This is slow: need to start at the base and go forward until the
-   * byte we are looking for.  Return 1 when we went past it, 0 otherwise. */
-  const char_u *q = base;
-  while (q < p) {
-    q += dbcs_ptr2len(q);
-  }
-
-  return (q == p) ? 0 : 1;
-}
-
-/*
- * Special version of dbcs_head_off() that works for ScreenLines[], where
- * single-width DBCS_JPNU characters are stored separately.
- */
-int dbcs_screen_head_off(const char_u *base, const char_u *p)
-{
-  /* It can't be a trailing byte when not using DBCS, at the start of the
-   * string or the previous byte can't start a double-byte.
-   * For euc-jp an 0x8e byte in the previous cell always means we have a
-   * lead byte in the current cell. */
-  if (p <= base
-      || (enc_dbcs == DBCS_JPNU && p[-1] == 0x8e)
-      || MB_BYTE2LEN(p[-1]) == 1
-      || *p == NUL)
-    return 0;
-
-  /* This is slow: need to start at the base and go forward until the
-   * byte we are looking for.  Return 1 when we went past it, 0 otherwise.
-   * For DBCS_JPNU look out for 0x8e, which means the second byte is not
-   * stored as the next byte. */
-  const char_u *q = base;
-  while (q < p) {
-    if (enc_dbcs == DBCS_JPNU && *q == 0x8e) {
-      ++q;
-    }
-    else {
-      q += dbcs_ptr2len(q);
-    }
-  }
-
-  return (q == p) ? 0 : 1;
-}
-
+/// Return offset from "p" to the first byte of the character it points into.
+/// If "p" points to the NUL at the end of the string return 0.
+/// Returns 0 when already at the first byte of a character.
 int utf_head_off(const char_u *base, const char_u *p)
 {
   int c;
@@ -2232,26 +1742,20 @@ int mb_tail_off(char_u *base, char_u *p)
   if (*p == NUL)
     return 0;
 
-  if (enc_utf8) {
-    /* Find the last character that is 10xx.xxxx */
-    for (i = 0; (p[i + 1] & 0xc0) == 0x80; ++i)
-      ;
-    /* Check for illegal sequence. */
-    for (j = 0; p - j > base; ++j)
-      if ((p[-j] & 0xc0) != 0x80)
-        break;
-    if (utf8len_tab[p[-j]] != i + j + 1)
-      return 0;
-    return i;
+  // Find the last character that is 10xx.xxxx
+  for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
+
+  // Check for illegal sequence.
+  for (j = 0; p - j > base; j++) {
+    if ((p[-j] & 0xc0) != 0x80) {
+      break;
+    }
   }
 
-  /* It can't be the first byte if a double-byte when not using DBCS, at the
-   * end of the string or the byte can't start a double-byte. */
-  if (enc_dbcs == 0 || p[1] == NUL || MB_BYTE2LEN(*p) == 1)
+  if (utf8len_tab[p[-j]] != i + j + 1) {
     return 0;
-
-  /* Return 1 when on the lead byte, 0 when on the tail byte. */
-  return 1 - dbcs_head_off(base, p);
+  }
+  return i;
 }
 
 /*
@@ -2466,13 +1970,10 @@ int mb_fix_col(int col, int row)
 {
   col = check_col(col);
   row = check_row(row);
-  if (has_mbyte && ScreenLines != NULL && col > 0
-      && ((enc_dbcs
-          && ScreenLines[LineOffset[row] + col] != NUL
-          && dbcs_screen_head_off(ScreenLines + LineOffset[row],
-            ScreenLines + LineOffset[row] + col))
-        || (enc_utf8 && ScreenLines[LineOffset[row] + col] == 0)))
+  if (ScreenLines != NULL && col > 0
+      && ScreenLines[LineOffset[row] + col] == 0) {
     return col - 1;
+  }
   return col;
 }
author	Björn Linse <bjorn.linse@gmail.com>	2016-11-05 18:12:14 +0100
committer	GitHub <noreply@github.com>	2016-11-05 18:12:14 +0100
commit	9147331e212e8d0fff0f30344007faffd42609ee (patch)
tree	6d1853a72b94ce19ba0cc163c664a9a206da0091 /src/nvim/mbyte.c
parent	32d9c19e294f38a6adae6e055fc606fc3fd33f2f (diff)
parent	4ab3fe8eaadb5456eeafc49df2fb0ecf71d836cc (diff)
download	rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.gz rneovim-9147331e212e8d0fff0f30344007faffd42609ee.tar.bz2 rneovim-9147331e212e8d0fff0f30344007faffd42609ee.zip