diff options
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r-- | src/nvim/mbyte.c | 808 |
1 files changed, 449 insertions, 359 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c index cba372b9d3..fea1ab77a2 100644 --- a/src/nvim/mbyte.c +++ b/src/nvim/mbyte.c @@ -31,30 +31,30 @@ #include <wchar.h> #include <wctype.h> -#include "nvim/vim.h" #include "nvim/ascii.h" +#include "nvim/vim.h" #ifdef HAVE_LOCALE_H # include <locale.h> #endif -#include "nvim/eval.h" -#include "nvim/path.h" -#include "nvim/iconv.h" -#include "nvim/mbyte.h" +#include "nvim/arabic.h" #include "nvim/charset.h" #include "nvim/cursor.h" +#include "nvim/eval.h" #include "nvim/fileio.h" #include "nvim/func_attr.h" +#include "nvim/iconv.h" +#include "nvim/mark.h" +#include "nvim/mbyte.h" #include "nvim/memline.h" +#include "nvim/memory.h" #include "nvim/message.h" #include "nvim/misc1.h" -#include "nvim/memory.h" #include "nvim/option.h" +#include "nvim/os/os.h" +#include "nvim/path.h" #include "nvim/screen.h" #include "nvim/spell.h" #include "nvim/strings.h" -#include "nvim/os/os.h" -#include "nvim/arabic.h" -#include "nvim/mark.h" typedef struct { int rangeStart; @@ -70,6 +70,7 @@ struct interval { #ifdef INCLUDE_GENERATED_DECLARATIONS # include "mbyte.c.generated.h" + # include "unicode_tables.generated.h" #endif @@ -127,131 +128,131 @@ static struct enc_canon_table[] = { #define IDX_LATIN_1 0 - {"latin1", ENC_8BIT + ENC_LATIN1, 1252}, + { "latin1", ENC_8BIT + ENC_LATIN1, 1252 }, #define IDX_ISO_2 1 - {"iso-8859-2", ENC_8BIT, 0}, + { "iso-8859-2", ENC_8BIT, 0 }, #define IDX_ISO_3 2 - {"iso-8859-3", ENC_8BIT, 0}, + { "iso-8859-3", ENC_8BIT, 0 }, #define IDX_ISO_4 3 - {"iso-8859-4", ENC_8BIT, 0}, + { "iso-8859-4", ENC_8BIT, 0 }, #define IDX_ISO_5 4 - {"iso-8859-5", ENC_8BIT, 0}, + { "iso-8859-5", ENC_8BIT, 0 }, #define IDX_ISO_6 5 - {"iso-8859-6", ENC_8BIT, 0}, + { "iso-8859-6", ENC_8BIT, 0 }, #define IDX_ISO_7 6 - {"iso-8859-7", ENC_8BIT, 0}, + { "iso-8859-7", ENC_8BIT, 0 }, #define IDX_ISO_8 7 - {"iso-8859-8", ENC_8BIT, 0}, + { "iso-8859-8", ENC_8BIT, 0 }, #define IDX_ISO_9 8 - {"iso-8859-9", ENC_8BIT, 0}, + { "iso-8859-9", ENC_8BIT, 0 }, #define IDX_ISO_10 9 - {"iso-8859-10", ENC_8BIT, 0}, + { "iso-8859-10", ENC_8BIT, 0 }, #define IDX_ISO_11 10 - {"iso-8859-11", ENC_8BIT, 0}, + { "iso-8859-11", ENC_8BIT, 0 }, #define IDX_ISO_13 11 - {"iso-8859-13", ENC_8BIT, 0}, + { "iso-8859-13", ENC_8BIT, 0 }, #define IDX_ISO_14 12 - {"iso-8859-14", ENC_8BIT, 0}, + { "iso-8859-14", ENC_8BIT, 0 }, #define IDX_ISO_15 13 - {"iso-8859-15", ENC_8BIT + ENC_LATIN9, 0}, + { "iso-8859-15", ENC_8BIT + ENC_LATIN9, 0 }, #define IDX_KOI8_R 14 - {"koi8-r", ENC_8BIT, 0}, + { "koi8-r", ENC_8BIT, 0 }, #define IDX_KOI8_U 15 - {"koi8-u", ENC_8BIT, 0}, + { "koi8-u", ENC_8BIT, 0 }, #define IDX_UTF8 16 - {"utf-8", ENC_UNICODE, 0}, + { "utf-8", ENC_UNICODE, 0 }, #define IDX_UCS2 17 - {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0}, + { "ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0 }, #define IDX_UCS2LE 18 - {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0}, + { "ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0 }, #define IDX_UTF16 19 - {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0}, + { "utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0 }, #define IDX_UTF16LE 20 - {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0}, + { "utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0 }, #define IDX_UCS4 21 - {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0}, + { "ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0 }, #define IDX_UCS4LE 22 - {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0}, + { "ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0 }, - /* For debugging DBCS encoding on Unix. */ + // For debugging DBCS encoding on Unix. #define IDX_DEBUG 23 - {"debug", ENC_DBCS, DBCS_DEBUG}, + { "debug", ENC_DBCS, DBCS_DEBUG }, #define IDX_EUC_JP 24 - {"euc-jp", ENC_DBCS, DBCS_JPNU}, + { "euc-jp", ENC_DBCS, DBCS_JPNU }, #define IDX_SJIS 25 - {"sjis", ENC_DBCS, DBCS_JPN}, + { "sjis", ENC_DBCS, DBCS_JPN }, #define IDX_EUC_KR 26 - {"euc-kr", ENC_DBCS, DBCS_KORU}, + { "euc-kr", ENC_DBCS, DBCS_KORU }, #define IDX_EUC_CN 27 - {"euc-cn", ENC_DBCS, DBCS_CHSU}, + { "euc-cn", ENC_DBCS, DBCS_CHSU }, #define IDX_EUC_TW 28 - {"euc-tw", ENC_DBCS, DBCS_CHTU}, + { "euc-tw", ENC_DBCS, DBCS_CHTU }, #define IDX_BIG5 29 - {"big5", ENC_DBCS, DBCS_CHT}, + { "big5", ENC_DBCS, DBCS_CHT }, /* MS-DOS and MS-Windows codepages are included here, so that they can be * used on Unix too. Most of them are similar to ISO-8859 encodings, but * not exactly the same. */ #define IDX_CP437 30 - {"cp437", ENC_8BIT, 437}, /* like iso-8859-1 */ + { "cp437", ENC_8BIT, 437 }, // like iso-8859-1 #define IDX_CP737 31 - {"cp737", ENC_8BIT, 737}, /* like iso-8859-7 */ + { "cp737", ENC_8BIT, 737 }, // like iso-8859-7 #define IDX_CP775 32 - {"cp775", ENC_8BIT, 775}, /* Baltic */ + { "cp775", ENC_8BIT, 775 }, // Baltic #define IDX_CP850 33 - {"cp850", ENC_8BIT, 850}, /* like iso-8859-4 */ + { "cp850", ENC_8BIT, 850 }, // like iso-8859-4 #define IDX_CP852 34 - {"cp852", ENC_8BIT, 852}, /* like iso-8859-1 */ + { "cp852", ENC_8BIT, 852 }, // like iso-8859-1 #define IDX_CP855 35 - {"cp855", ENC_8BIT, 855}, /* like iso-8859-2 */ + { "cp855", ENC_8BIT, 855 }, // like iso-8859-2 #define IDX_CP857 36 - {"cp857", ENC_8BIT, 857}, /* like iso-8859-5 */ + { "cp857", ENC_8BIT, 857 }, // like iso-8859-5 #define IDX_CP860 37 - {"cp860", ENC_8BIT, 860}, /* like iso-8859-9 */ + { "cp860", ENC_8BIT, 860 }, // like iso-8859-9 #define IDX_CP861 38 - {"cp861", ENC_8BIT, 861}, /* like iso-8859-1 */ + { "cp861", ENC_8BIT, 861 }, // like iso-8859-1 #define IDX_CP862 39 - {"cp862", ENC_8BIT, 862}, /* like iso-8859-1 */ + { "cp862", ENC_8BIT, 862 }, // like iso-8859-1 #define IDX_CP863 40 - {"cp863", ENC_8BIT, 863}, /* like iso-8859-8 */ + { "cp863", ENC_8BIT, 863 }, // like iso-8859-8 #define IDX_CP865 41 - {"cp865", ENC_8BIT, 865}, /* like iso-8859-1 */ + { "cp865", ENC_8BIT, 865 }, // like iso-8859-1 #define IDX_CP866 42 - {"cp866", ENC_8BIT, 866}, /* like iso-8859-5 */ + { "cp866", ENC_8BIT, 866 }, // like iso-8859-5 #define IDX_CP869 43 - {"cp869", ENC_8BIT, 869}, /* like iso-8859-7 */ + { "cp869", ENC_8BIT, 869 }, // like iso-8859-7 #define IDX_CP874 44 - {"cp874", ENC_8BIT, 874}, /* Thai */ + { "cp874", ENC_8BIT, 874 }, // Thai #define IDX_CP932 45 - {"cp932", ENC_DBCS, DBCS_JPN}, + { "cp932", ENC_DBCS, DBCS_JPN }, #define IDX_CP936 46 - {"cp936", ENC_DBCS, DBCS_CHS}, + { "cp936", ENC_DBCS, DBCS_CHS }, #define IDX_CP949 47 - {"cp949", ENC_DBCS, DBCS_KOR}, + { "cp949", ENC_DBCS, DBCS_KOR }, #define IDX_CP950 48 - {"cp950", ENC_DBCS, DBCS_CHT}, + { "cp950", ENC_DBCS, DBCS_CHT }, #define IDX_CP1250 49 - {"cp1250", ENC_8BIT, 1250}, /* Czech, Polish, etc. */ + { "cp1250", ENC_8BIT, 1250 }, // Czech, Polish, etc. #define IDX_CP1251 50 - {"cp1251", ENC_8BIT, 1251}, /* Cyrillic */ - /* cp1252 is considered to be equal to latin1 */ + { "cp1251", ENC_8BIT, 1251 }, // Cyrillic + // cp1252 is considered to be equal to latin1 #define IDX_CP1253 51 - {"cp1253", ENC_8BIT, 1253}, /* Greek */ + { "cp1253", ENC_8BIT, 1253 }, // Greek #define IDX_CP1254 52 - {"cp1254", ENC_8BIT, 1254}, /* Turkish */ + { "cp1254", ENC_8BIT, 1254 }, // Turkish #define IDX_CP1255 53 - {"cp1255", ENC_8BIT, 1255}, /* Hebrew */ + { "cp1255", ENC_8BIT, 1255 }, // Hebrew #define IDX_CP1256 54 - {"cp1256", ENC_8BIT, 1256}, /* Arabic */ + { "cp1256", ENC_8BIT, 1256 }, // Arabic #define IDX_CP1257 55 - {"cp1257", ENC_8BIT, 1257}, /* Baltic */ + { "cp1257", ENC_8BIT, 1257 }, // Baltic #define IDX_CP1258 56 - {"cp1258", ENC_8BIT, 1258}, /* Vietnamese */ + { "cp1258", ENC_8BIT, 1258 }, // Vietnamese #define IDX_MACROMAN 57 - {"macroman", ENC_8BIT + ENC_MACROMAN, 0}, /* Mac OS */ + { "macroman", ENC_8BIT + ENC_MACROMAN, 0 }, // Mac OS #define IDX_HPROMAN8 58 - {"hp-roman8", ENC_8BIT, 0}, /* HP Roman8 */ + { "hp-roman8", ENC_8BIT, 0 }, // HP Roman8 #define IDX_COUNT 59 }; @@ -336,9 +337,11 @@ static int enc_canon_search(const char_u *name) { int i; - for (i = 0; i < IDX_COUNT; ++i) - if (STRCMP(name, enc_canon_table[i].name) == 0) + for (i = 0; i < IDX_COUNT; ++i) { + if (STRCMP(name, enc_canon_table[i].name) == 0) { return i; + } + } return -1; } @@ -353,12 +356,13 @@ int enc_canon_props(const char_u *name) int i; i = enc_canon_search(name); - if (i >= 0) + if (i >= 0) { return enc_canon_table[i].prop; - if (STRNCMP(name, "2byte-", 6) == 0) + } else if (STRNCMP(name, "2byte-", 6) == 0) { return ENC_DBCS; - if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) + } else if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) { return ENC_8BIT; + } return 0; } @@ -436,21 +440,23 @@ static bool intable(const struct interval *table, size_t n_items, int c) { int mid, bot, top; - /* first quick check for Latin1 etc. characters */ - if (c < table[0].first) + // first quick check for Latin1 etc. characters + if (c < table[0].first) { return false; + } - /* binary search in table */ + // binary search in table bot = 0; top = (int)(n_items - 1); while (top >= bot) { mid = (bot + top) / 2; - if (table[mid].last < c) + if (table[mid].last < c) { bot = mid + 1; - else if (table[mid].first > c) + } else if (table[mid].first > c) { top = mid - 1; - else + } else { return true; + } } return false; } @@ -512,12 +518,14 @@ int utf_ptr2cells(const char_u *p) // Need to convert to a character number. if (*p >= 0x80) { c = utf_ptr2char(p); - /* An illegal byte is displayed as <xx>. */ - if (utf_ptr2len(p) == 1 || c == NUL) + // An illegal byte is displayed as <xx>. + if (utf_ptr2len(p) == 1 || c == NUL) { return 4; - /* If the char is ASCII it must be an overlong sequence. */ - if (c < 0x80) + } + // If the char is ASCII it must be an overlong sequence. + if (c < 0x80) { return char2cells(c); + } return utf_char2cells(c); } return 1; @@ -529,17 +537,20 @@ int utf_ptr2cells_len(const char_u *p, int size) { int c; - /* Need to convert to a wide character. */ + // Need to convert to a wide character. if (size > 0 && *p >= 0x80) { - if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) - return 1; /* truncated */ + if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) { + return 1; // truncated + } c = utf_ptr2char(p); - /* An illegal byte is displayed as <xx>. */ - if (utf_ptr2len(p) == 1 || c == NUL) + // An illegal byte is displayed as <xx>. + if (utf_ptr2len(p) == 1 || c == NUL) { return 4; - /* If the char is ASCII it must be an overlong sequence. */ - if (c < 0x80) + } + // If the char is ASCII it must be an overlong sequence. + if (c < 0x80) { return char2cells(c); + } return utf_char2cells(c); } return 1; @@ -651,13 +662,14 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n) { int c; - if (*n == 0) /* end of buffer */ + if (*n == 0) { // end of buffer return 0; + } uint8_t k = utf8len_tab_zero[**s]; if (k == 1) { - /* ASCII character or NUL */ + // ASCII character or NUL (*n)--; return *(*s)++; } @@ -674,14 +686,14 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n) * U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too. * It's safe even if n=1, else we would have k=2 > n. */ if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) { - /* byte sequence was successfully decoded */ + // byte sequence was successfully decoded *s += k; *n -= k; return c; } } - /* byte sequence is incomplete or illegal */ + // byte sequence is incomplete or illegal return -1; } @@ -721,10 +733,12 @@ bool utf_composinglike(const char_u *p1, const char_u *p2) int c2; c2 = utf_ptr2char(p2); - if (utf_iscomposing(c2)) + if (utf_iscomposing(c2)) { return true; - if (!arabic_maycombine(c2)) + } + if (!arabic_maycombine(c2)) { return false; + } return arabic_combine(utf_ptr2char(p1), c2); } @@ -746,23 +760,26 @@ int utfc_ptr2char(const char_u *p, int *pcc) c = utf_ptr2char(p); len = utf_ptr2len(p); - /* Only accept a composing char when the first char isn't illegal. */ + // Only accept a composing char when the first char isn't illegal. if ((len > 1 || *p < 0x80) && p[len] >= 0x80 && UTF_COMPOSINGLIKE(p, p + len)) { cc = utf_ptr2char(p + len); for (;; ) { pcc[i++] = cc; - if (i == MAX_MCO) + if (i == MAX_MCO) { break; + } len += utf_ptr2len(p + len); - if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) + if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) { break; + } } } - if (i < MAX_MCO) /* last composing char must be 0 */ + if (i < MAX_MCO) { // last composing char must be 0 pcc[i] = 0; + } return c; } @@ -855,15 +872,19 @@ int utf_ptr2len_len(const char_u *p, int size) int m; len = utf8len_tab[*p]; - if (len == 1) - return 1; /* NUL, ascii or illegal lead byte */ - if (len > size) - m = size; /* incomplete byte sequence. */ - else + if (len == 1) { + return 1; // NUL, ascii or illegal lead byte + } + if (len > size) { + m = size; // incomplete byte sequence. + } else { m = len; - for (i = 1; i < m; ++i) - if ((p[i] & 0xc0) != 0x80) + } + for (i = 1; i < m; ++i) { + if ((p[i] & 0xc0) != 0x80) { return 1; + } + } return len; } @@ -915,17 +936,20 @@ int utfc_ptr2len_len(const char_u *p, int size) int len; int prevlen; - if (size < 1 || *p == NUL) + if (size < 1 || *p == NUL) { return 0; - if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) /* be quick for ASCII */ + } + if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) { // be quick for ASCII return 1; + } - /* Skip over first UTF-8 char, stopping at a NUL byte. */ + // Skip over first UTF-8 char, stopping at a NUL byte. len = utf_ptr2len_len(p, size); - /* Check for illegal byte and incomplete byte sequence. */ - if ((len == 1 && p[0] >= 0x80) || len > size) + // Check for illegal byte and incomplete byte sequence. + if ((len == 1 && p[0] >= 0x80) || len > size) { return 1; + } /* * Check for composing characters. We can handle only the first six, but @@ -935,21 +959,24 @@ int utfc_ptr2len_len(const char_u *p, int size) while (len < size) { int len_next_char; - if (p[len] < 0x80) + if (p[len] < 0x80) { break; + } /* * Next character length should not go beyond size to ensure that * UTF_COMPOSINGLIKE(...) does not read beyond size. */ len_next_char = utf_ptr2len_len(p + len, size - len); - if (len_next_char > size - len) + if (len_next_char > size - len) { break; + } - if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) + if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) { break; + } - /* Skip over composing char */ + // Skip over composing char prevlen = len; len += len_next_char; } @@ -1043,9 +1070,9 @@ bool utf_printable(int c) * 0xd800-0xdfff is reserved for UTF-16, actually illegal. */ static struct interval nonprint[] = { - {0x070f, 0x070f}, {0x180b, 0x180e}, {0x200b, 0x200f}, {0x202a, 0x202e}, - {0x206a, 0x206f}, {0xd800, 0xdfff}, {0xfeff, 0xfeff}, {0xfff9, 0xfffb}, - {0xfffe, 0xffff} + { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e }, + { 0x206a, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb }, + { 0xfffe, 0xffff } }; return !intable(nonprint, ARRAY_SIZE(nonprint), c); @@ -1065,7 +1092,7 @@ int utf_class(const int c) int utf_class_tab(const int c, const uint64_t *const chartab) { - /* sorted list of non-overlapping intervals */ + // sorted list of non-overlapping intervals static struct clinterval { unsigned int first; unsigned int last; @@ -1147,7 +1174,7 @@ int utf_class_tab(const int c, const uint64_t *const chartab) int top = ARRAY_SIZE(classes) - 1; int mid; - /* First quick check for Latin1 characters, use 'iskeyword'. */ + // First quick check for Latin1 characters, use 'iskeyword'. if (c < 0x100) { if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) { return 0; // blank @@ -1158,15 +1185,16 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 1; // punctuation } - /* binary search in table */ + // binary search in table while (top >= bot) { mid = (bot + top) / 2; - if (classes[mid].last < (unsigned int)c) + if (classes[mid].last < (unsigned int)c) { bot = mid + 1; - else if (classes[mid].first > (unsigned int)c) + } else if (classes[mid].first > (unsigned int)c) { top = mid - 1; - else + } else { return (int)classes[mid].class; + } } // emoji @@ -1174,7 +1202,7 @@ int utf_class_tab(const int c, const uint64_t *const chartab) return 3; } - /* most other characters are "word" characters */ + // most other characters are "word" characters return 2; } @@ -1191,25 +1219,27 @@ bool utf_ambiguous_width(int c) */ static int utf_convert(int a, const convertStruct *const table, size_t n_items) { - size_t start, mid, end; /* indices into table */ + size_t start, mid, end; // indices into table start = 0; end = n_items; while (start < end) { - /* need to search further */ + // need to search further mid = (end + start) / 2; - if (table[mid].rangeEnd < a) + if (table[mid].rangeEnd < a) { start = mid + 1; - else + } else { end = mid; + } } if (start < n_items && table[start].rangeStart <= a && a <= table[start].rangeEnd - && (a - table[start].rangeStart) % table[start].step == 0) + && (a - table[start].rangeStart) % table[start].step == 0) { return a + table[start].offset; - else + } else { return a; + } } /* @@ -1234,21 +1264,24 @@ int utf_fold(int a) /// simple case folding. int mb_toupper(int a) { - /* If 'casemap' contains "keepascii" use ASCII style toupper(). */ - if (a < 128 && (cmp_flags & CMP_KEEPASCII)) + // If 'casemap' contains "keepascii" use ASCII style toupper(). + if (a < 128 && (cmp_flags & CMP_KEEPASCII)) { return TOUPPER_ASC(a); + } #if defined(__STDC_ISO_10646__) - /* If towupper() is available and handles Unicode, use it. */ - if (!(cmp_flags & CMP_INTERNAL)) + // If towupper() is available and handles Unicode, use it. + if (!(cmp_flags & CMP_INTERNAL)) { return towupper(a); + } #endif - /* For characters below 128 use locale sensitive toupper(). */ - if (a < 128) + // For characters below 128 use locale sensitive toupper(). + if (a < 128) { return TOUPPER_LOC(a); + } - /* For any other characters use the above mapping table. */ + // For any other characters use the above mapping table. return utf_convert(a, toUpper, ARRAY_SIZE(toUpper)); } @@ -1262,21 +1295,24 @@ bool mb_islower(int a) /// simple case folding. int mb_tolower(int a) { - /* If 'casemap' contains "keepascii" use ASCII style tolower(). */ - if (a < 128 && (cmp_flags & CMP_KEEPASCII)) + // If 'casemap' contains "keepascii" use ASCII style tolower(). + if (a < 128 && (cmp_flags & CMP_KEEPASCII)) { return TOLOWER_ASC(a); + } #if defined(__STDC_ISO_10646__) - /* If towlower() is available and handles Unicode, use it. */ - if (!(cmp_flags & CMP_INTERNAL)) + // If towlower() is available and handles Unicode, use it. + if (!(cmp_flags & CMP_INTERNAL)) { return towlower(a); + } #endif - /* For characters below 128 use locale sensitive tolower(). */ - if (a < 128) + // For characters below 128 use locale sensitive tolower(). + if (a < 128) { return TOLOWER_LOC(a); + } - /* For any other characters use the above mapping table. */ + // For any other characters use the above mapping table. return utf_convert(a, toLower, ARRAY_SIZE(toLower)); } @@ -1285,8 +1321,7 @@ bool mb_isupper(int a) return mb_tolower(a) != a; } -static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, - size_t n2) +static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2) { int c1, c2, cdiff; char_u buffer[6]; @@ -1295,23 +1330,27 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, c1 = utf_safe_read_char_adv(&s1, &n1); c2 = utf_safe_read_char_adv(&s2, &n2); - if (c1 <= 0 || c2 <= 0) + if (c1 <= 0 || c2 <= 0) { break; + } - if (c1 == c2) + if (c1 == c2) { continue; + } cdiff = utf_fold(c1) - utf_fold(c2); - if (cdiff != 0) + if (cdiff != 0) { return cdiff; + } } - /* some string ended or has an incomplete/illegal character sequence */ + // some string ended or has an incomplete/illegal character sequence if (c1 == 0 || c2 == 0) { - /* some string ended. shorter string is smaller */ - if (c1 == 0 && c2 == 0) + // some string ended. shorter string is smaller + if (c1 == 0 && c2 == 0) { return 0; + } return c1 == 0 ? -1 : 1; } @@ -1332,8 +1371,9 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) { cdiff = (int)(*s1) - (int)(*s2); - if (cdiff != 0) + if (cdiff != 0) { return cdiff; + } s1++; s2++; @@ -1341,19 +1381,22 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, n2--; } - if (n1 > 0 && *s1 == NUL) + if (n1 > 0 && *s1 == NUL) { n1 = 0; - if (n2 > 0 && *s2 == NUL) + } + if (n2 > 0 && *s2 == NUL) { n2 = 0; + } - if (n1 == 0 && n2 == 0) + if (n1 == 0 && n2 == 0) { return 0; + } return n1 == 0 ? -1 : 1; } #ifdef WIN32 #ifndef CP_UTF8 -# define CP_UTF8 65001 /* magic number from winnls.h */ +# define CP_UTF8 65001 // magic number from winnls.h #endif /// Converts string from UTF-8 to UTF-16. @@ -1453,8 +1496,7 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8) /// @param len maximum length (an earlier NUL terminates) /// @param[out] codepoints incremented with UTF-32 code point size /// @param[out] codeunits incremented with UTF-16 code unit size -void mb_utflen(const char_u *s, size_t len, size_t *codepoints, - size_t *codeunits) +void mb_utflen(const char_u *s, size_t len, size_t *codepoints, size_t *codeunits) FUNC_ATTR_NONNULL_ALL { size_t count = 0, extra = 0; @@ -1473,8 +1515,7 @@ void mb_utflen(const char_u *s, size_t len, size_t *codepoints, *codeunits += count + extra; } -ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, - size_t index, bool use_utf16_units) +ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, size_t index, bool use_utf16_units) FUNC_ATTR_NONNULL_ALL { size_t count = 0; @@ -1537,7 +1578,7 @@ void show_utf8(void) { int len; int rlen = 0; - char_u *line; + char_u *line; int clen; int i; @@ -1553,7 +1594,7 @@ void show_utf8(void) clen = 0; for (i = 0; i < len; ++i) { if (clen == 0) { - /* start of (composing) character, get its length */ + // start of (composing) character, get its length if (i > 0) { STRCPY(IObuff + rlen, "+ "); rlen += 2; @@ -1561,11 +1602,12 @@ void show_utf8(void) clen = utf_ptr2len(line + i); } sprintf((char *)IObuff + rlen, "%02x ", - (line[i] == NL) ? NUL : line[i]); /* NUL is stored as NL */ + (line[i] == NL) ? NUL : line[i]); // NUL is stored as NL --clen; rlen += (int)STRLEN(IObuff + rlen); - if (rlen > IOSIZE - 20) + if (rlen > IOSIZE - 20) { break; + } } msg(IObuff); @@ -1579,42 +1621,49 @@ int utf_head_off(const char_u *base, const char_u *p) int c; int len; - if (*p < 0x80) /* be quick for ASCII */ + if (*p < 0x80) { // be quick for ASCII return 0; + } /* Skip backwards over trailing bytes: 10xx.xxxx * Skip backwards again if on a composing char. */ const char_u *q; for (q = p;; --q) { - /* Move s to the last byte of this char. */ + // Move s to the last byte of this char. const char_u *s; for (s = q; (s[1] & 0xc0) == 0x80; ++s) {} - /* Move q to the first byte of this char. */ - while (q > base && (*q & 0xc0) == 0x80) + // Move q to the first byte of this char. + while (q > base && (*q & 0xc0) == 0x80) { --q; + } /* Check for illegal sequence. Do allow an illegal byte after where we * started. */ len = utf8len_tab[*q]; - if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) + if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) { return 0; + } - if (q <= base) + if (q <= base) { break; + } c = utf_ptr2char(q); - if (utf_iscomposing(c)) + if (utf_iscomposing(c)) { continue; + } if (arabic_maycombine(c)) { - /* Advance to get a sneak-peak at the next char */ + // Advance to get a sneak-peak at the next char const char_u *j = q; --j; - /* Move j to the first byte of this char. */ - while (j > base && (*j & 0xc0) == 0x80) + // Move j to the first byte of this char. + while (j > base && (*j & 0xc0) == 0x80) { --j; - if (arabic_combine(utf_ptr2char(j), c)) + } + if (arabic_combine(utf_ptr2char(j), c)) { continue; + } } break; } @@ -1627,12 +1676,12 @@ bool utf_eat_space(int cc) FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT { return (cc >= 0x2000 && cc <= 0x206F) // General punctuations - || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations - || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations - || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations - || (cc >= 0xff1a && cc <= 0xff20) // .. - || (cc >= 0xff3b && cc <= 0xff40) // .. - || (cc >= 0xff5b && cc <= 0xff65); // .. + || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations + || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations + || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations + || (cc >= 0xff1a && cc <= 0xff20) // .. + || (cc >= 0xff3b && cc <= 0xff40) // .. + || (cc >= 0xff5b && cc <= 0xff65); // .. } // Whether line break is allowed before "cc". @@ -1814,8 +1863,9 @@ int mb_tail_off(char_u *base, char_u *p) int i; int j; - if (*p == NUL) + if (*p == NUL) { return 0; + } // Find the last character that is 10xx.xxxx for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {} @@ -1839,10 +1889,10 @@ int mb_tail_off(char_u *base, char_u *p) void utf_find_illegal(void) { pos_T pos = curwin->w_cursor; - char_u *p; + char_u *p; int len; vimconv_T vimconv; - char_u *tofree = NULL; + char_u *tofree = NULL; vimconv.vc_type = CONV_NONE; if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) { @@ -1858,8 +1908,9 @@ void utf_find_illegal(void) if (vimconv.vc_type != CONV_NONE) { xfree(tofree); tofree = string_convert(&vimconv, p, NULL); - if (tofree == NULL) + if (tofree == NULL) { break; + } p = tofree; } @@ -1868,10 +1919,10 @@ void utf_find_illegal(void) * utf_ptr2len()) or too many of them (overlong sequence). */ len = utf_ptr2len(p); if (*p >= 0x80 && (len == 1 - || utf_char2len(utf_ptr2char(p)) != len)) { - if (vimconv.vc_type == CONV_NONE) + || utf_char2len(utf_ptr2char(p)) != len)) { + if (vimconv.vc_type == CONV_NONE) { curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr()); - else { + } else { int l; len = (int)(p - tofree); @@ -1884,13 +1935,14 @@ void utf_find_illegal(void) } p += len; } - if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) + if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) { break; + } ++curwin->w_cursor.lnum; curwin->w_cursor.col = 0; } - /* didn't find it: don't move and beep */ + // didn't find it: don't move and beep curwin->w_cursor = pos; beep_flush(); @@ -1944,13 +1996,10 @@ void mb_check_adjust_col(void *win_) } } -/* - * Return a pointer to the character before "*p", if there is one. - */ -char_u * mb_prevptr( - char_u *line, /* start of the string */ - char_u *p - ) +/// @param line start of the string +/// +/// @return a pointer to the character before "*p", if there is one. +char_u * mb_prevptr(char_u *line, char_u *p) { if (p > line) { MB_PTR_BACK(line, p); @@ -1964,14 +2013,16 @@ char_u * mb_prevptr( */ int mb_charlen(char_u *str) { - char_u *p = str; + char_u *p = str; int count; - if (p == NULL) + if (p == NULL) { return 0; + } - for (count = 0; *p != NUL; count++) + for (count = 0; *p != NUL; count++) { p += (*mb_ptr2len)(p); + } return count; } @@ -1981,11 +2032,12 @@ int mb_charlen(char_u *str) */ int mb_charlen_len(char_u *str, int len) { - char_u *p = str; + char_u *p = str; int count; - for (count = 0; *p != NUL && p < str + len; count++) + for (count = 0; *p != NUL && p < str + len; count++) { p += (*mb_ptr2len)(p); + } return count; } @@ -2049,10 +2101,12 @@ const char *mb_unescape(const char **const pp) */ char_u * enc_skip(char_u *p) { - if (STRNCMP(p, "2byte-", 6) == 0) + if (STRNCMP(p, "2byte-", 6) == 0) { return p + 6; - if (STRNCMP(p, "8bit-", 5) == 0) + } + if (STRNCMP(p, "8bit-", 5) == 0) { return p + 5; + } return p; } @@ -2064,7 +2118,7 @@ char_u * enc_skip(char_u *p) */ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET { - char_u *p, *s; + char_u *p, *s; int i; if (STRCMP(enc, "default") == 0) { @@ -2072,47 +2126,51 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET return vim_strsave(fenc_default); } - /* copy "enc" to allocated memory, with room for two '-' */ + // copy "enc" to allocated memory, with room for two '-' char_u *r = xmalloc(STRLEN(enc) + 3); - /* Make it all lower case and replace '_' with '-'. */ + // Make it all lower case and replace '_' with '-'. p = r; for (s = enc; *s != NUL; ++s) { - if (*s == '_') + if (*s == '_') { *p++ = '-'; - else + } else { *p++ = TOLOWER_ASC(*s); + } } *p = NUL; - /* Skip "2byte-" and "8bit-". */ + // Skip "2byte-" and "8bit-". p = enc_skip(r); - /* Change "microsoft-cp" to "cp". Used in some spell files. */ - if (STRNCMP(p, "microsoft-cp", 12) == 0) + // Change "microsoft-cp" to "cp". Used in some spell files. + if (STRNCMP(p, "microsoft-cp", 12) == 0) { STRMOVE(p, p + 10); + } - /* "iso8859" -> "iso-8859" */ + // "iso8859" -> "iso-8859" if (STRNCMP(p, "iso8859", 7) == 0) { STRMOVE(p + 4, p + 3); p[3] = '-'; } - /* "iso-8859n" -> "iso-8859-n" */ + // "iso-8859n" -> "iso-8859-n" if (STRNCMP(p, "iso-8859", 8) == 0 && p[8] != '-') { STRMOVE(p + 9, p + 8); p[8] = '-'; } - /* "latin-N" -> "latinN" */ - if (STRNCMP(p, "latin-", 6) == 0) + // "latin-N" -> "latinN" + if (STRNCMP(p, "latin-", 6) == 0) { STRMOVE(p + 5, p + 6); + } if (enc_canon_search(p) >= 0) { - /* canonical name can be used unmodified */ - if (p != r) + // canonical name can be used unmodified + if (p != r) { STRMOVE(r, p); + } } else if ((i = enc_alias_search(p)) >= 0) { - /* alias recognized, get canonical name */ + // alias recognized, get canonical name xfree(r); r = vim_strsave((char_u *)enc_canon_table[i].name); } @@ -2127,9 +2185,11 @@ static int enc_alias_search(char_u *name) { int i; - for (i = 0; enc_alias_table[i].name != NULL; ++i) - if (STRCMP(name, enc_alias_table[i].name) == 0) + for (i = 0; enc_alias_table[i].name != NULL; ++i) { + if (STRCMP(name, enc_alias_table[i].name) == 0) { return enc_alias_table[i].canon; + } + } return -1; } @@ -2219,13 +2279,13 @@ void * my_iconv_open(char_u *to, char_u *from) iconv_t fd; #define ICONV_TESTLEN 400 char_u tobuf[ICONV_TESTLEN]; - char *p; + char *p; size_t tolen; static WorkingStatus iconv_working = kUnknown; - if (iconv_working == kBroken) - return (void *)-1; /* detected a broken iconv() previously */ - + if (iconv_working == kBroken) { + return (void *)-1; // detected a broken iconv() previously + } fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from)); if (fd != (iconv_t)-1 && iconv_working == kUnknown) { @@ -2243,8 +2303,9 @@ void * my_iconv_open(char_u *to, char_u *from) iconv_working = kBroken; iconv_close(fd); fd = (iconv_t)-1; - } else + } else { iconv_working = kWorking; + } } return (void *)fd; @@ -2257,17 +2318,17 @@ void * my_iconv_open(char_u *to, char_u *from) * Returns the converted string in allocated memory. NULL for an error. * If resultlenp is not NULL, sets it to the result length in bytes. */ -static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, - size_t slen, size_t *unconvlenp, size_t *resultlenp) +static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen, + size_t *unconvlenp, size_t *resultlenp) { - const char *from; + const char *from; size_t fromlen; - char *to; + char *to; size_t tolen; size_t len = 0; size_t done = 0; - char_u *result = NULL; - char_u *p; + char_u *result = NULL; + char_u *p; int l; from = (char *)str; @@ -2278,8 +2339,9 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, * increase the buffer size. */ len = len + fromlen * 2 + 40; p = xmalloc(len); - if (done > 0) + if (done > 0) { memmove(p, result, done); + } xfree(result); result = p; } @@ -2327,8 +2389,9 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, done = to - (char *)result; } - if (resultlenp != NULL && result != NULL) + if (resultlenp != NULL && result != NULL) { *resultlenp = (size_t)(to - (char *)result); + } return result; } @@ -2353,8 +2416,8 @@ int convert_setup(vimconv_T *vcp, char_u *from, char_u *to) /// As convert_setup(), but only when from_unicode_is_utf8 is true will all /// "from" unicode charsets be considered utf-8. Same for "to". -int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, - char_u *to, bool to_unicode_is_utf8) +int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, char_u *to, + bool to_unicode_is_utf8) { int from_prop; int to_prop; @@ -2369,51 +2432,54 @@ int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, # endif *vcp = (vimconv_T)MBYTE_NONE_CONV; - /* No conversion when one of the names is empty or they are equal. */ + // No conversion when one of the names is empty or they are equal. if (from == NULL || *from == NUL || to == NULL || *to == NUL - || STRCMP(from, to) == 0) + || STRCMP(from, to) == 0) { return OK; + } from_prop = enc_canon_props(from); to_prop = enc_canon_props(to); - if (from_unicode_is_utf8) + if (from_unicode_is_utf8) { from_is_utf8 = from_prop & ENC_UNICODE; - else + } else { from_is_utf8 = from_prop == ENC_UNICODE; - if (to_unicode_is_utf8) + } + if (to_unicode_is_utf8) { to_is_utf8 = to_prop & ENC_UNICODE; - else + } else { to_is_utf8 = to_prop == ENC_UNICODE; + } if ((from_prop & ENC_LATIN1) && to_is_utf8) { - /* Internal latin1 -> utf-8 conversion. */ + // Internal latin1 -> utf-8 conversion. vcp->vc_type = CONV_TO_UTF8; - vcp->vc_factor = 2; /* up to twice as long */ + vcp->vc_factor = 2; // up to twice as long } else if ((from_prop & ENC_LATIN9) && to_is_utf8) { - /* Internal latin9 -> utf-8 conversion. */ + // Internal latin9 -> utf-8 conversion. vcp->vc_type = CONV_9_TO_UTF8; - vcp->vc_factor = 3; /* up to three as long (euro sign) */ + vcp->vc_factor = 3; // up to three as long (euro sign) } else if (from_is_utf8 && (to_prop & ENC_LATIN1)) { - /* Internal utf-8 -> latin1 conversion. */ + // Internal utf-8 -> latin1 conversion. vcp->vc_type = CONV_TO_LATIN1; } else if (from_is_utf8 && (to_prop & ENC_LATIN9)) { - /* Internal utf-8 -> latin9 conversion. */ + // Internal utf-8 -> latin9 conversion. vcp->vc_type = CONV_TO_LATIN9; } # ifdef HAVE_ICONV else { // NOLINT(readability/braces) // Use iconv() for conversion. - vcp->vc_fd = (iconv_t)my_iconv_open( - to_is_utf8 ? (char_u *)"utf-8" : to, - from_is_utf8 ? (char_u *)"utf-8" : from); + vcp->vc_fd = (iconv_t)my_iconv_open(to_is_utf8 ? (char_u *)"utf-8" : to, + from_is_utf8 ? (char_u *)"utf-8" : from); if (vcp->vc_fd != (iconv_t)-1) { vcp->vc_type = CONV_ICONV; - vcp->vc_factor = 4; /* could be longer too... */ + vcp->vc_factor = 4; // could be longer too... } } # endif - if (vcp->vc_type == CONV_NONE) + if (vcp->vc_type == CONV_NONE) { return FAIL; + } return OK; } @@ -2435,129 +2501,153 @@ char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp) * an incomplete sequence at the end it is not converted and "*unconvlenp" is * set to the number of remaining bytes. */ -char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr, - size_t *lenp, size_t *unconvlenp) +char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp, + size_t *unconvlenp) { - char_u *retval = NULL; - char_u *d; + char_u *retval = NULL; + char_u *d; int l; int c; size_t len; - if (lenp == NULL) + if (lenp == NULL) { len = STRLEN(ptr); - else + } else { len = *lenp; - if (len == 0) + } + if (len == 0) { return vim_strsave((char_u *)""); + } switch (vcp->vc_type) { - case CONV_TO_UTF8: /* latin1 to utf-8 conversion */ - retval = xmalloc(len * 2 + 1); - d = retval; - for (size_t i = 0; i < len; ++i) { - c = ptr[i]; - if (c < 0x80) - *d++ = c; - else { - *d++ = 0xc0 + ((unsigned)c >> 6); - *d++ = 0x80 + (c & 0x3f); - } + case CONV_TO_UTF8: // latin1 to utf-8 conversion + retval = xmalloc(len * 2 + 1); + d = retval; + for (size_t i = 0; i < len; ++i) { + c = ptr[i]; + if (c < 0x80) { + *d++ = c; + } else { + *d++ = 0xc0 + ((unsigned)c >> 6); + *d++ = 0x80 + (c & 0x3f); } - *d = NUL; - if (lenp != NULL) - *lenp = (size_t)(d - retval); - break; + } + *d = NUL; + if (lenp != NULL) { + *lenp = (size_t)(d - retval); + } + break; - case CONV_9_TO_UTF8: /* latin9 to utf-8 conversion */ - retval = xmalloc(len * 3 + 1); - d = retval; - for (size_t i = 0; i < len; ++i) { - c = ptr[i]; - switch (c) { - case 0xa4: c = 0x20ac; break; /* euro */ - case 0xa6: c = 0x0160; break; /* S hat */ - case 0xa8: c = 0x0161; break; /* S -hat */ - case 0xb4: c = 0x017d; break; /* Z hat */ - case 0xb8: c = 0x017e; break; /* Z -hat */ - case 0xbc: c = 0x0152; break; /* OE */ - case 0xbd: c = 0x0153; break; /* oe */ - case 0xbe: c = 0x0178; break; /* Y */ - } - d += utf_char2bytes(c, d); + case CONV_9_TO_UTF8: // latin9 to utf-8 conversion + retval = xmalloc(len * 3 + 1); + d = retval; + for (size_t i = 0; i < len; ++i) { + c = ptr[i]; + switch (c) { + case 0xa4: + c = 0x20ac; break; // euro + case 0xa6: + c = 0x0160; break; // S hat + case 0xa8: + c = 0x0161; break; // S -hat + case 0xb4: + c = 0x017d; break; // Z hat + case 0xb8: + c = 0x017e; break; // Z -hat + case 0xbc: + c = 0x0152; break; // OE + case 0xbd: + c = 0x0153; break; // oe + case 0xbe: + c = 0x0178; break; // Y } - *d = NUL; - if (lenp != NULL) - *lenp = (size_t)(d - retval); - break; + d += utf_char2bytes(c, d); + } + *d = NUL; + if (lenp != NULL) { + *lenp = (size_t)(d - retval); + } + break; - case CONV_TO_LATIN1: /* utf-8 to latin1 conversion */ - case CONV_TO_LATIN9: /* utf-8 to latin9 conversion */ - retval = xmalloc(len + 1); - d = retval; - for (size_t i = 0; i < len; ++i) { - l = utf_ptr2len_len(ptr + i, len - i); - if (l == 0) - *d++ = NUL; - else if (l == 1) { - uint8_t l_w = utf8len_tab_zero[ptr[i]]; - - if (l_w == 0) { - /* Illegal utf-8 byte cannot be converted */ + case CONV_TO_LATIN1: // utf-8 to latin1 conversion + case CONV_TO_LATIN9: // utf-8 to latin9 conversion + retval = xmalloc(len + 1); + d = retval; + for (size_t i = 0; i < len; ++i) { + l = utf_ptr2len_len(ptr + i, len - i); + if (l == 0) { + *d++ = NUL; + } else if (l == 1) { + uint8_t l_w = utf8len_tab_zero[ptr[i]]; + + if (l_w == 0) { + // Illegal utf-8 byte cannot be converted + xfree(retval); + return NULL; + } + if (unconvlenp != NULL && l_w > len - i) { + // Incomplete sequence at the end. + *unconvlenp = len - i; + break; + } + *d++ = ptr[i]; + } else { + c = utf_ptr2char(ptr + i); + if (vcp->vc_type == CONV_TO_LATIN9) { + switch (c) { + case 0x20ac: + c = 0xa4; break; // euro + case 0x0160: + c = 0xa6; break; // S hat + case 0x0161: + c = 0xa8; break; // S -hat + case 0x017d: + c = 0xb4; break; // Z hat + case 0x017e: + c = 0xb8; break; // Z -hat + case 0x0152: + c = 0xbc; break; // OE + case 0x0153: + c = 0xbd; break; // oe + case 0x0178: + c = 0xbe; break; // Y + case 0xa4: + case 0xa6: + case 0xa8: + case 0xb4: + case 0xb8: + case 0xbc: + case 0xbd: + case 0xbe: + c = 0x100; break; // not in latin9 + } + } + if (!utf_iscomposing(c)) { // skip composing chars + if (c < 0x100) { + *d++ = c; + } else if (vcp->vc_fail) { xfree(retval); return NULL; - } - if (unconvlenp != NULL && l_w > len - i) { - /* Incomplete sequence at the end. */ - *unconvlenp = len - i; - break; - } - *d++ = ptr[i]; - } else { - c = utf_ptr2char(ptr + i); - if (vcp->vc_type == CONV_TO_LATIN9) - switch (c) { - case 0x20ac: c = 0xa4; break; /* euro */ - case 0x0160: c = 0xa6; break; /* S hat */ - case 0x0161: c = 0xa8; break; /* S -hat */ - case 0x017d: c = 0xb4; break; /* Z hat */ - case 0x017e: c = 0xb8; break; /* Z -hat */ - case 0x0152: c = 0xbc; break; /* OE */ - case 0x0153: c = 0xbd; break; /* oe */ - case 0x0178: c = 0xbe; break; /* Y */ - case 0xa4: - case 0xa6: - case 0xa8: - case 0xb4: - case 0xb8: - case 0xbc: - case 0xbd: - case 0xbe: c = 0x100; break; /* not in latin9 */ - } - if (!utf_iscomposing(c)) { /* skip composing chars */ - if (c < 0x100) - *d++ = c; - else if (vcp->vc_fail) { - xfree(retval); - return NULL; - } else { - *d++ = 0xbf; - if (utf_char2cells(c) > 1) - *d++ = '?'; + } else { + *d++ = 0xbf; + if (utf_char2cells(c) > 1) { + *d++ = '?'; } } - i += l - 1; } + i += l - 1; } - *d = NUL; - if (lenp != NULL) - *lenp = (size_t)(d - retval); - break; + } + *d = NUL; + if (lenp != NULL) { + *lenp = (size_t)(d - retval); + } + break; # ifdef HAVE_ICONV - case CONV_ICONV: // conversion with vcp->vc_fd - retval = iconv_string(vcp, ptr, len, unconvlenp, lenp); - break; + case CONV_ICONV: // conversion with vcp->vc_fd + retval = iconv_string(vcp, ptr, len, unconvlenp, lenp); + break; # endif } |