aboutsummaryrefslogtreecommitdiff
path: root/src/nvim/mbyte.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/nvim/mbyte.c')
-rw-r--r--src/nvim/mbyte.c808
1 files changed, 449 insertions, 359 deletions
diff --git a/src/nvim/mbyte.c b/src/nvim/mbyte.c
index cba372b9d3..fea1ab77a2 100644
--- a/src/nvim/mbyte.c
+++ b/src/nvim/mbyte.c
@@ -31,30 +31,30 @@
#include <wchar.h>
#include <wctype.h>
-#include "nvim/vim.h"
#include "nvim/ascii.h"
+#include "nvim/vim.h"
#ifdef HAVE_LOCALE_H
# include <locale.h>
#endif
-#include "nvim/eval.h"
-#include "nvim/path.h"
-#include "nvim/iconv.h"
-#include "nvim/mbyte.h"
+#include "nvim/arabic.h"
#include "nvim/charset.h"
#include "nvim/cursor.h"
+#include "nvim/eval.h"
#include "nvim/fileio.h"
#include "nvim/func_attr.h"
+#include "nvim/iconv.h"
+#include "nvim/mark.h"
+#include "nvim/mbyte.h"
#include "nvim/memline.h"
+#include "nvim/memory.h"
#include "nvim/message.h"
#include "nvim/misc1.h"
-#include "nvim/memory.h"
#include "nvim/option.h"
+#include "nvim/os/os.h"
+#include "nvim/path.h"
#include "nvim/screen.h"
#include "nvim/spell.h"
#include "nvim/strings.h"
-#include "nvim/os/os.h"
-#include "nvim/arabic.h"
-#include "nvim/mark.h"
typedef struct {
int rangeStart;
@@ -70,6 +70,7 @@ struct interval {
#ifdef INCLUDE_GENERATED_DECLARATIONS
# include "mbyte.c.generated.h"
+
# include "unicode_tables.generated.h"
#endif
@@ -127,131 +128,131 @@ static struct
enc_canon_table[] =
{
#define IDX_LATIN_1 0
- {"latin1", ENC_8BIT + ENC_LATIN1, 1252},
+ { "latin1", ENC_8BIT + ENC_LATIN1, 1252 },
#define IDX_ISO_2 1
- {"iso-8859-2", ENC_8BIT, 0},
+ { "iso-8859-2", ENC_8BIT, 0 },
#define IDX_ISO_3 2
- {"iso-8859-3", ENC_8BIT, 0},
+ { "iso-8859-3", ENC_8BIT, 0 },
#define IDX_ISO_4 3
- {"iso-8859-4", ENC_8BIT, 0},
+ { "iso-8859-4", ENC_8BIT, 0 },
#define IDX_ISO_5 4
- {"iso-8859-5", ENC_8BIT, 0},
+ { "iso-8859-5", ENC_8BIT, 0 },
#define IDX_ISO_6 5
- {"iso-8859-6", ENC_8BIT, 0},
+ { "iso-8859-6", ENC_8BIT, 0 },
#define IDX_ISO_7 6
- {"iso-8859-7", ENC_8BIT, 0},
+ { "iso-8859-7", ENC_8BIT, 0 },
#define IDX_ISO_8 7
- {"iso-8859-8", ENC_8BIT, 0},
+ { "iso-8859-8", ENC_8BIT, 0 },
#define IDX_ISO_9 8
- {"iso-8859-9", ENC_8BIT, 0},
+ { "iso-8859-9", ENC_8BIT, 0 },
#define IDX_ISO_10 9
- {"iso-8859-10", ENC_8BIT, 0},
+ { "iso-8859-10", ENC_8BIT, 0 },
#define IDX_ISO_11 10
- {"iso-8859-11", ENC_8BIT, 0},
+ { "iso-8859-11", ENC_8BIT, 0 },
#define IDX_ISO_13 11
- {"iso-8859-13", ENC_8BIT, 0},
+ { "iso-8859-13", ENC_8BIT, 0 },
#define IDX_ISO_14 12
- {"iso-8859-14", ENC_8BIT, 0},
+ { "iso-8859-14", ENC_8BIT, 0 },
#define IDX_ISO_15 13
- {"iso-8859-15", ENC_8BIT + ENC_LATIN9, 0},
+ { "iso-8859-15", ENC_8BIT + ENC_LATIN9, 0 },
#define IDX_KOI8_R 14
- {"koi8-r", ENC_8BIT, 0},
+ { "koi8-r", ENC_8BIT, 0 },
#define IDX_KOI8_U 15
- {"koi8-u", ENC_8BIT, 0},
+ { "koi8-u", ENC_8BIT, 0 },
#define IDX_UTF8 16
- {"utf-8", ENC_UNICODE, 0},
+ { "utf-8", ENC_UNICODE, 0 },
#define IDX_UCS2 17
- {"ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0},
+ { "ucs-2", ENC_UNICODE + ENC_ENDIAN_B + ENC_2BYTE, 0 },
#define IDX_UCS2LE 18
- {"ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0},
+ { "ucs-2le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2BYTE, 0 },
#define IDX_UTF16 19
- {"utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0},
+ { "utf-16", ENC_UNICODE + ENC_ENDIAN_B + ENC_2WORD, 0 },
#define IDX_UTF16LE 20
- {"utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0},
+ { "utf-16le", ENC_UNICODE + ENC_ENDIAN_L + ENC_2WORD, 0 },
#define IDX_UCS4 21
- {"ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0},
+ { "ucs-4", ENC_UNICODE + ENC_ENDIAN_B + ENC_4BYTE, 0 },
#define IDX_UCS4LE 22
- {"ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0},
+ { "ucs-4le", ENC_UNICODE + ENC_ENDIAN_L + ENC_4BYTE, 0 },
- /* For debugging DBCS encoding on Unix. */
+ // For debugging DBCS encoding on Unix.
#define IDX_DEBUG 23
- {"debug", ENC_DBCS, DBCS_DEBUG},
+ { "debug", ENC_DBCS, DBCS_DEBUG },
#define IDX_EUC_JP 24
- {"euc-jp", ENC_DBCS, DBCS_JPNU},
+ { "euc-jp", ENC_DBCS, DBCS_JPNU },
#define IDX_SJIS 25
- {"sjis", ENC_DBCS, DBCS_JPN},
+ { "sjis", ENC_DBCS, DBCS_JPN },
#define IDX_EUC_KR 26
- {"euc-kr", ENC_DBCS, DBCS_KORU},
+ { "euc-kr", ENC_DBCS, DBCS_KORU },
#define IDX_EUC_CN 27
- {"euc-cn", ENC_DBCS, DBCS_CHSU},
+ { "euc-cn", ENC_DBCS, DBCS_CHSU },
#define IDX_EUC_TW 28
- {"euc-tw", ENC_DBCS, DBCS_CHTU},
+ { "euc-tw", ENC_DBCS, DBCS_CHTU },
#define IDX_BIG5 29
- {"big5", ENC_DBCS, DBCS_CHT},
+ { "big5", ENC_DBCS, DBCS_CHT },
/* MS-DOS and MS-Windows codepages are included here, so that they can be
* used on Unix too. Most of them are similar to ISO-8859 encodings, but
* not exactly the same. */
#define IDX_CP437 30
- {"cp437", ENC_8BIT, 437}, /* like iso-8859-1 */
+ { "cp437", ENC_8BIT, 437 }, // like iso-8859-1
#define IDX_CP737 31
- {"cp737", ENC_8BIT, 737}, /* like iso-8859-7 */
+ { "cp737", ENC_8BIT, 737 }, // like iso-8859-7
#define IDX_CP775 32
- {"cp775", ENC_8BIT, 775}, /* Baltic */
+ { "cp775", ENC_8BIT, 775 }, // Baltic
#define IDX_CP850 33
- {"cp850", ENC_8BIT, 850}, /* like iso-8859-4 */
+ { "cp850", ENC_8BIT, 850 }, // like iso-8859-4
#define IDX_CP852 34
- {"cp852", ENC_8BIT, 852}, /* like iso-8859-1 */
+ { "cp852", ENC_8BIT, 852 }, // like iso-8859-1
#define IDX_CP855 35
- {"cp855", ENC_8BIT, 855}, /* like iso-8859-2 */
+ { "cp855", ENC_8BIT, 855 }, // like iso-8859-2
#define IDX_CP857 36
- {"cp857", ENC_8BIT, 857}, /* like iso-8859-5 */
+ { "cp857", ENC_8BIT, 857 }, // like iso-8859-5
#define IDX_CP860 37
- {"cp860", ENC_8BIT, 860}, /* like iso-8859-9 */
+ { "cp860", ENC_8BIT, 860 }, // like iso-8859-9
#define IDX_CP861 38
- {"cp861", ENC_8BIT, 861}, /* like iso-8859-1 */
+ { "cp861", ENC_8BIT, 861 }, // like iso-8859-1
#define IDX_CP862 39
- {"cp862", ENC_8BIT, 862}, /* like iso-8859-1 */
+ { "cp862", ENC_8BIT, 862 }, // like iso-8859-1
#define IDX_CP863 40
- {"cp863", ENC_8BIT, 863}, /* like iso-8859-8 */
+ { "cp863", ENC_8BIT, 863 }, // like iso-8859-8
#define IDX_CP865 41
- {"cp865", ENC_8BIT, 865}, /* like iso-8859-1 */
+ { "cp865", ENC_8BIT, 865 }, // like iso-8859-1
#define IDX_CP866 42
- {"cp866", ENC_8BIT, 866}, /* like iso-8859-5 */
+ { "cp866", ENC_8BIT, 866 }, // like iso-8859-5
#define IDX_CP869 43
- {"cp869", ENC_8BIT, 869}, /* like iso-8859-7 */
+ { "cp869", ENC_8BIT, 869 }, // like iso-8859-7
#define IDX_CP874 44
- {"cp874", ENC_8BIT, 874}, /* Thai */
+ { "cp874", ENC_8BIT, 874 }, // Thai
#define IDX_CP932 45
- {"cp932", ENC_DBCS, DBCS_JPN},
+ { "cp932", ENC_DBCS, DBCS_JPN },
#define IDX_CP936 46
- {"cp936", ENC_DBCS, DBCS_CHS},
+ { "cp936", ENC_DBCS, DBCS_CHS },
#define IDX_CP949 47
- {"cp949", ENC_DBCS, DBCS_KOR},
+ { "cp949", ENC_DBCS, DBCS_KOR },
#define IDX_CP950 48
- {"cp950", ENC_DBCS, DBCS_CHT},
+ { "cp950", ENC_DBCS, DBCS_CHT },
#define IDX_CP1250 49
- {"cp1250", ENC_8BIT, 1250}, /* Czech, Polish, etc. */
+ { "cp1250", ENC_8BIT, 1250 }, // Czech, Polish, etc.
#define IDX_CP1251 50
- {"cp1251", ENC_8BIT, 1251}, /* Cyrillic */
- /* cp1252 is considered to be equal to latin1 */
+ { "cp1251", ENC_8BIT, 1251 }, // Cyrillic
+ // cp1252 is considered to be equal to latin1
#define IDX_CP1253 51
- {"cp1253", ENC_8BIT, 1253}, /* Greek */
+ { "cp1253", ENC_8BIT, 1253 }, // Greek
#define IDX_CP1254 52
- {"cp1254", ENC_8BIT, 1254}, /* Turkish */
+ { "cp1254", ENC_8BIT, 1254 }, // Turkish
#define IDX_CP1255 53
- {"cp1255", ENC_8BIT, 1255}, /* Hebrew */
+ { "cp1255", ENC_8BIT, 1255 }, // Hebrew
#define IDX_CP1256 54
- {"cp1256", ENC_8BIT, 1256}, /* Arabic */
+ { "cp1256", ENC_8BIT, 1256 }, // Arabic
#define IDX_CP1257 55
- {"cp1257", ENC_8BIT, 1257}, /* Baltic */
+ { "cp1257", ENC_8BIT, 1257 }, // Baltic
#define IDX_CP1258 56
- {"cp1258", ENC_8BIT, 1258}, /* Vietnamese */
+ { "cp1258", ENC_8BIT, 1258 }, // Vietnamese
#define IDX_MACROMAN 57
- {"macroman", ENC_8BIT + ENC_MACROMAN, 0}, /* Mac OS */
+ { "macroman", ENC_8BIT + ENC_MACROMAN, 0 }, // Mac OS
#define IDX_HPROMAN8 58
- {"hp-roman8", ENC_8BIT, 0}, /* HP Roman8 */
+ { "hp-roman8", ENC_8BIT, 0 }, // HP Roman8
#define IDX_COUNT 59
};
@@ -336,9 +337,11 @@ static int enc_canon_search(const char_u *name)
{
int i;
- for (i = 0; i < IDX_COUNT; ++i)
- if (STRCMP(name, enc_canon_table[i].name) == 0)
+ for (i = 0; i < IDX_COUNT; ++i) {
+ if (STRCMP(name, enc_canon_table[i].name) == 0) {
return i;
+ }
+ }
return -1;
}
@@ -353,12 +356,13 @@ int enc_canon_props(const char_u *name)
int i;
i = enc_canon_search(name);
- if (i >= 0)
+ if (i >= 0) {
return enc_canon_table[i].prop;
- if (STRNCMP(name, "2byte-", 6) == 0)
+ } else if (STRNCMP(name, "2byte-", 6) == 0) {
return ENC_DBCS;
- if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0)
+ } else if (STRNCMP(name, "8bit-", 5) == 0 || STRNCMP(name, "iso-8859-", 9) == 0) {
return ENC_8BIT;
+ }
return 0;
}
@@ -436,21 +440,23 @@ static bool intable(const struct interval *table, size_t n_items, int c)
{
int mid, bot, top;
- /* first quick check for Latin1 etc. characters */
- if (c < table[0].first)
+ // first quick check for Latin1 etc. characters
+ if (c < table[0].first) {
return false;
+ }
- /* binary search in table */
+ // binary search in table
bot = 0;
top = (int)(n_items - 1);
while (top >= bot) {
mid = (bot + top) / 2;
- if (table[mid].last < c)
+ if (table[mid].last < c) {
bot = mid + 1;
- else if (table[mid].first > c)
+ } else if (table[mid].first > c) {
top = mid - 1;
- else
+ } else {
return true;
+ }
}
return false;
}
@@ -512,12 +518,14 @@ int utf_ptr2cells(const char_u *p)
// Need to convert to a character number.
if (*p >= 0x80) {
c = utf_ptr2char(p);
- /* An illegal byte is displayed as <xx>. */
- if (utf_ptr2len(p) == 1 || c == NUL)
+ // An illegal byte is displayed as <xx>.
+ if (utf_ptr2len(p) == 1 || c == NUL) {
return 4;
- /* If the char is ASCII it must be an overlong sequence. */
- if (c < 0x80)
+ }
+ // If the char is ASCII it must be an overlong sequence.
+ if (c < 0x80) {
return char2cells(c);
+ }
return utf_char2cells(c);
}
return 1;
@@ -529,17 +537,20 @@ int utf_ptr2cells_len(const char_u *p, int size)
{
int c;
- /* Need to convert to a wide character. */
+ // Need to convert to a wide character.
if (size > 0 && *p >= 0x80) {
- if (utf_ptr2len_len(p, size) < utf8len_tab[*p])
- return 1; /* truncated */
+ if (utf_ptr2len_len(p, size) < utf8len_tab[*p]) {
+ return 1; // truncated
+ }
c = utf_ptr2char(p);
- /* An illegal byte is displayed as <xx>. */
- if (utf_ptr2len(p) == 1 || c == NUL)
+ // An illegal byte is displayed as <xx>.
+ if (utf_ptr2len(p) == 1 || c == NUL) {
return 4;
- /* If the char is ASCII it must be an overlong sequence. */
- if (c < 0x80)
+ }
+ // If the char is ASCII it must be an overlong sequence.
+ if (c < 0x80) {
return char2cells(c);
+ }
return utf_char2cells(c);
}
return 1;
@@ -651,13 +662,14 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
{
int c;
- if (*n == 0) /* end of buffer */
+ if (*n == 0) { // end of buffer
return 0;
+ }
uint8_t k = utf8len_tab_zero[**s];
if (k == 1) {
- /* ASCII character or NUL */
+ // ASCII character or NUL
(*n)--;
return *(*s)++;
}
@@ -674,14 +686,14 @@ static int utf_safe_read_char_adv(const char_u **s, size_t *n)
* U+00C3 (UTF-8: 0xC3 0x83), so need to check that special case too.
* It's safe even if n=1, else we would have k=2 > n. */
if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83)) {
- /* byte sequence was successfully decoded */
+ // byte sequence was successfully decoded
*s += k;
*n -= k;
return c;
}
}
- /* byte sequence is incomplete or illegal */
+ // byte sequence is incomplete or illegal
return -1;
}
@@ -721,10 +733,12 @@ bool utf_composinglike(const char_u *p1, const char_u *p2)
int c2;
c2 = utf_ptr2char(p2);
- if (utf_iscomposing(c2))
+ if (utf_iscomposing(c2)) {
return true;
- if (!arabic_maycombine(c2))
+ }
+ if (!arabic_maycombine(c2)) {
return false;
+ }
return arabic_combine(utf_ptr2char(p1), c2);
}
@@ -746,23 +760,26 @@ int utfc_ptr2char(const char_u *p, int *pcc)
c = utf_ptr2char(p);
len = utf_ptr2len(p);
- /* Only accept a composing char when the first char isn't illegal. */
+ // Only accept a composing char when the first char isn't illegal.
if ((len > 1 || *p < 0x80)
&& p[len] >= 0x80
&& UTF_COMPOSINGLIKE(p, p + len)) {
cc = utf_ptr2char(p + len);
for (;; ) {
pcc[i++] = cc;
- if (i == MAX_MCO)
+ if (i == MAX_MCO) {
break;
+ }
len += utf_ptr2len(p + len);
- if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len)))
+ if (p[len] < 0x80 || !utf_iscomposing(cc = utf_ptr2char(p + len))) {
break;
+ }
}
}
- if (i < MAX_MCO) /* last composing char must be 0 */
+ if (i < MAX_MCO) { // last composing char must be 0
pcc[i] = 0;
+ }
return c;
}
@@ -855,15 +872,19 @@ int utf_ptr2len_len(const char_u *p, int size)
int m;
len = utf8len_tab[*p];
- if (len == 1)
- return 1; /* NUL, ascii or illegal lead byte */
- if (len > size)
- m = size; /* incomplete byte sequence. */
- else
+ if (len == 1) {
+ return 1; // NUL, ascii or illegal lead byte
+ }
+ if (len > size) {
+ m = size; // incomplete byte sequence.
+ } else {
m = len;
- for (i = 1; i < m; ++i)
- if ((p[i] & 0xc0) != 0x80)
+ }
+ for (i = 1; i < m; ++i) {
+ if ((p[i] & 0xc0) != 0x80) {
return 1;
+ }
+ }
return len;
}
@@ -915,17 +936,20 @@ int utfc_ptr2len_len(const char_u *p, int size)
int len;
int prevlen;
- if (size < 1 || *p == NUL)
+ if (size < 1 || *p == NUL) {
return 0;
- if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) /* be quick for ASCII */
+ }
+ if (p[0] < 0x80 && (size == 1 || p[1] < 0x80)) { // be quick for ASCII
return 1;
+ }
- /* Skip over first UTF-8 char, stopping at a NUL byte. */
+ // Skip over first UTF-8 char, stopping at a NUL byte.
len = utf_ptr2len_len(p, size);
- /* Check for illegal byte and incomplete byte sequence. */
- if ((len == 1 && p[0] >= 0x80) || len > size)
+ // Check for illegal byte and incomplete byte sequence.
+ if ((len == 1 && p[0] >= 0x80) || len > size) {
return 1;
+ }
/*
* Check for composing characters. We can handle only the first six, but
@@ -935,21 +959,24 @@ int utfc_ptr2len_len(const char_u *p, int size)
while (len < size) {
int len_next_char;
- if (p[len] < 0x80)
+ if (p[len] < 0x80) {
break;
+ }
/*
* Next character length should not go beyond size to ensure that
* UTF_COMPOSINGLIKE(...) does not read beyond size.
*/
len_next_char = utf_ptr2len_len(p + len, size - len);
- if (len_next_char > size - len)
+ if (len_next_char > size - len) {
break;
+ }
- if (!UTF_COMPOSINGLIKE(p + prevlen, p + len))
+ if (!UTF_COMPOSINGLIKE(p + prevlen, p + len)) {
break;
+ }
- /* Skip over composing char */
+ // Skip over composing char
prevlen = len;
len += len_next_char;
}
@@ -1043,9 +1070,9 @@ bool utf_printable(int c)
* 0xd800-0xdfff is reserved for UTF-16, actually illegal. */
static struct interval nonprint[] =
{
- {0x070f, 0x070f}, {0x180b, 0x180e}, {0x200b, 0x200f}, {0x202a, 0x202e},
- {0x206a, 0x206f}, {0xd800, 0xdfff}, {0xfeff, 0xfeff}, {0xfff9, 0xfffb},
- {0xfffe, 0xffff}
+ { 0x070f, 0x070f }, { 0x180b, 0x180e }, { 0x200b, 0x200f }, { 0x202a, 0x202e },
+ { 0x206a, 0x206f }, { 0xd800, 0xdfff }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb },
+ { 0xfffe, 0xffff }
};
return !intable(nonprint, ARRAY_SIZE(nonprint), c);
@@ -1065,7 +1092,7 @@ int utf_class(const int c)
int utf_class_tab(const int c, const uint64_t *const chartab)
{
- /* sorted list of non-overlapping intervals */
+ // sorted list of non-overlapping intervals
static struct clinterval {
unsigned int first;
unsigned int last;
@@ -1147,7 +1174,7 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
int top = ARRAY_SIZE(classes) - 1;
int mid;
- /* First quick check for Latin1 characters, use 'iskeyword'. */
+ // First quick check for Latin1 characters, use 'iskeyword'.
if (c < 0x100) {
if (c == ' ' || c == '\t' || c == NUL || c == 0xa0) {
return 0; // blank
@@ -1158,15 +1185,16 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 1; // punctuation
}
- /* binary search in table */
+ // binary search in table
while (top >= bot) {
mid = (bot + top) / 2;
- if (classes[mid].last < (unsigned int)c)
+ if (classes[mid].last < (unsigned int)c) {
bot = mid + 1;
- else if (classes[mid].first > (unsigned int)c)
+ } else if (classes[mid].first > (unsigned int)c) {
top = mid - 1;
- else
+ } else {
return (int)classes[mid].class;
+ }
}
// emoji
@@ -1174,7 +1202,7 @@ int utf_class_tab(const int c, const uint64_t *const chartab)
return 3;
}
- /* most other characters are "word" characters */
+ // most other characters are "word" characters
return 2;
}
@@ -1191,25 +1219,27 @@ bool utf_ambiguous_width(int c)
*/
static int utf_convert(int a, const convertStruct *const table, size_t n_items)
{
- size_t start, mid, end; /* indices into table */
+ size_t start, mid, end; // indices into table
start = 0;
end = n_items;
while (start < end) {
- /* need to search further */
+ // need to search further
mid = (end + start) / 2;
- if (table[mid].rangeEnd < a)
+ if (table[mid].rangeEnd < a) {
start = mid + 1;
- else
+ } else {
end = mid;
+ }
}
if (start < n_items
&& table[start].rangeStart <= a
&& a <= table[start].rangeEnd
- && (a - table[start].rangeStart) % table[start].step == 0)
+ && (a - table[start].rangeStart) % table[start].step == 0) {
return a + table[start].offset;
- else
+ } else {
return a;
+ }
}
/*
@@ -1234,21 +1264,24 @@ int utf_fold(int a)
/// simple case folding.
int mb_toupper(int a)
{
- /* If 'casemap' contains "keepascii" use ASCII style toupper(). */
- if (a < 128 && (cmp_flags & CMP_KEEPASCII))
+ // If 'casemap' contains "keepascii" use ASCII style toupper().
+ if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
return TOUPPER_ASC(a);
+ }
#if defined(__STDC_ISO_10646__)
- /* If towupper() is available and handles Unicode, use it. */
- if (!(cmp_flags & CMP_INTERNAL))
+ // If towupper() is available and handles Unicode, use it.
+ if (!(cmp_flags & CMP_INTERNAL)) {
return towupper(a);
+ }
#endif
- /* For characters below 128 use locale sensitive toupper(). */
- if (a < 128)
+ // For characters below 128 use locale sensitive toupper().
+ if (a < 128) {
return TOUPPER_LOC(a);
+ }
- /* For any other characters use the above mapping table. */
+ // For any other characters use the above mapping table.
return utf_convert(a, toUpper, ARRAY_SIZE(toUpper));
}
@@ -1262,21 +1295,24 @@ bool mb_islower(int a)
/// simple case folding.
int mb_tolower(int a)
{
- /* If 'casemap' contains "keepascii" use ASCII style tolower(). */
- if (a < 128 && (cmp_flags & CMP_KEEPASCII))
+ // If 'casemap' contains "keepascii" use ASCII style tolower().
+ if (a < 128 && (cmp_flags & CMP_KEEPASCII)) {
return TOLOWER_ASC(a);
+ }
#if defined(__STDC_ISO_10646__)
- /* If towlower() is available and handles Unicode, use it. */
- if (!(cmp_flags & CMP_INTERNAL))
+ // If towlower() is available and handles Unicode, use it.
+ if (!(cmp_flags & CMP_INTERNAL)) {
return towlower(a);
+ }
#endif
- /* For characters below 128 use locale sensitive tolower(). */
- if (a < 128)
+ // For characters below 128 use locale sensitive tolower().
+ if (a < 128) {
return TOLOWER_LOC(a);
+ }
- /* For any other characters use the above mapping table. */
+ // For any other characters use the above mapping table.
return utf_convert(a, toLower, ARRAY_SIZE(toLower));
}
@@ -1285,8 +1321,7 @@ bool mb_isupper(int a)
return mb_tolower(a) != a;
}
-static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
- size_t n2)
+static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1, size_t n2)
{
int c1, c2, cdiff;
char_u buffer[6];
@@ -1295,23 +1330,27 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
c1 = utf_safe_read_char_adv(&s1, &n1);
c2 = utf_safe_read_char_adv(&s2, &n2);
- if (c1 <= 0 || c2 <= 0)
+ if (c1 <= 0 || c2 <= 0) {
break;
+ }
- if (c1 == c2)
+ if (c1 == c2) {
continue;
+ }
cdiff = utf_fold(c1) - utf_fold(c2);
- if (cdiff != 0)
+ if (cdiff != 0) {
return cdiff;
+ }
}
- /* some string ended or has an incomplete/illegal character sequence */
+ // some string ended or has an incomplete/illegal character sequence
if (c1 == 0 || c2 == 0) {
- /* some string ended. shorter string is smaller */
- if (c1 == 0 && c2 == 0)
+ // some string ended. shorter string is smaller
+ if (c1 == 0 && c2 == 0) {
return 0;
+ }
return c1 == 0 ? -1 : 1;
}
@@ -1332,8 +1371,9 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL) {
cdiff = (int)(*s1) - (int)(*s2);
- if (cdiff != 0)
+ if (cdiff != 0) {
return cdiff;
+ }
s1++;
s2++;
@@ -1341,19 +1381,22 @@ static int utf_strnicmp(const char_u *s1, const char_u *s2, size_t n1,
n2--;
}
- if (n1 > 0 && *s1 == NUL)
+ if (n1 > 0 && *s1 == NUL) {
n1 = 0;
- if (n2 > 0 && *s2 == NUL)
+ }
+ if (n2 > 0 && *s2 == NUL) {
n2 = 0;
+ }
- if (n1 == 0 && n2 == 0)
+ if (n1 == 0 && n2 == 0) {
return 0;
+ }
return n1 == 0 ? -1 : 1;
}
#ifdef WIN32
#ifndef CP_UTF8
-# define CP_UTF8 65001 /* magic number from winnls.h */
+# define CP_UTF8 65001 // magic number from winnls.h
#endif
/// Converts string from UTF-8 to UTF-16.
@@ -1453,8 +1496,7 @@ int utf16_to_utf8(const wchar_t *utf16, int utf16len, char **utf8)
/// @param len maximum length (an earlier NUL terminates)
/// @param[out] codepoints incremented with UTF-32 code point size
/// @param[out] codeunits incremented with UTF-16 code unit size
-void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
- size_t *codeunits)
+void mb_utflen(const char_u *s, size_t len, size_t *codepoints, size_t *codeunits)
FUNC_ATTR_NONNULL_ALL
{
size_t count = 0, extra = 0;
@@ -1473,8 +1515,7 @@ void mb_utflen(const char_u *s, size_t len, size_t *codepoints,
*codeunits += count + extra;
}
-ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len,
- size_t index, bool use_utf16_units)
+ssize_t mb_utf_index_to_bytes(const char_u *s, size_t len, size_t index, bool use_utf16_units)
FUNC_ATTR_NONNULL_ALL
{
size_t count = 0;
@@ -1537,7 +1578,7 @@ void show_utf8(void)
{
int len;
int rlen = 0;
- char_u *line;
+ char_u *line;
int clen;
int i;
@@ -1553,7 +1594,7 @@ void show_utf8(void)
clen = 0;
for (i = 0; i < len; ++i) {
if (clen == 0) {
- /* start of (composing) character, get its length */
+ // start of (composing) character, get its length
if (i > 0) {
STRCPY(IObuff + rlen, "+ ");
rlen += 2;
@@ -1561,11 +1602,12 @@ void show_utf8(void)
clen = utf_ptr2len(line + i);
}
sprintf((char *)IObuff + rlen, "%02x ",
- (line[i] == NL) ? NUL : line[i]); /* NUL is stored as NL */
+ (line[i] == NL) ? NUL : line[i]); // NUL is stored as NL
--clen;
rlen += (int)STRLEN(IObuff + rlen);
- if (rlen > IOSIZE - 20)
+ if (rlen > IOSIZE - 20) {
break;
+ }
}
msg(IObuff);
@@ -1579,42 +1621,49 @@ int utf_head_off(const char_u *base, const char_u *p)
int c;
int len;
- if (*p < 0x80) /* be quick for ASCII */
+ if (*p < 0x80) { // be quick for ASCII
return 0;
+ }
/* Skip backwards over trailing bytes: 10xx.xxxx
* Skip backwards again if on a composing char. */
const char_u *q;
for (q = p;; --q) {
- /* Move s to the last byte of this char. */
+ // Move s to the last byte of this char.
const char_u *s;
for (s = q; (s[1] & 0xc0) == 0x80; ++s) {}
- /* Move q to the first byte of this char. */
- while (q > base && (*q & 0xc0) == 0x80)
+ // Move q to the first byte of this char.
+ while (q > base && (*q & 0xc0) == 0x80) {
--q;
+ }
/* Check for illegal sequence. Do allow an illegal byte after where we
* started. */
len = utf8len_tab[*q];
- if (len != (int)(s - q + 1) && len != (int)(p - q + 1))
+ if (len != (int)(s - q + 1) && len != (int)(p - q + 1)) {
return 0;
+ }
- if (q <= base)
+ if (q <= base) {
break;
+ }
c = utf_ptr2char(q);
- if (utf_iscomposing(c))
+ if (utf_iscomposing(c)) {
continue;
+ }
if (arabic_maycombine(c)) {
- /* Advance to get a sneak-peak at the next char */
+ // Advance to get a sneak-peak at the next char
const char_u *j = q;
--j;
- /* Move j to the first byte of this char. */
- while (j > base && (*j & 0xc0) == 0x80)
+ // Move j to the first byte of this char.
+ while (j > base && (*j & 0xc0) == 0x80) {
--j;
- if (arabic_combine(utf_ptr2char(j), c))
+ }
+ if (arabic_combine(utf_ptr2char(j), c)) {
continue;
+ }
}
break;
}
@@ -1627,12 +1676,12 @@ bool utf_eat_space(int cc)
FUNC_ATTR_CONST FUNC_ATTR_WARN_UNUSED_RESULT
{
return (cc >= 0x2000 && cc <= 0x206F) // General punctuations
- || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations
- || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations
- || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations
- || (cc >= 0xff1a && cc <= 0xff20) // ..
- || (cc >= 0xff3b && cc <= 0xff40) // ..
- || (cc >= 0xff5b && cc <= 0xff65); // ..
+ || (cc >= 0x2e00 && cc <= 0x2e7f) // Supplemental punctuations
+ || (cc >= 0x3000 && cc <= 0x303f) // CJK symbols and punctuations
+ || (cc >= 0xff01 && cc <= 0xff0f) // Full width ASCII punctuations
+ || (cc >= 0xff1a && cc <= 0xff20) // ..
+ || (cc >= 0xff3b && cc <= 0xff40) // ..
+ || (cc >= 0xff5b && cc <= 0xff65); // ..
}
// Whether line break is allowed before "cc".
@@ -1814,8 +1863,9 @@ int mb_tail_off(char_u *base, char_u *p)
int i;
int j;
- if (*p == NUL)
+ if (*p == NUL) {
return 0;
+ }
// Find the last character that is 10xx.xxxx
for (i = 0; (p[i + 1] & 0xc0) == 0x80; i++) {}
@@ -1839,10 +1889,10 @@ int mb_tail_off(char_u *base, char_u *p)
void utf_find_illegal(void)
{
pos_T pos = curwin->w_cursor;
- char_u *p;
+ char_u *p;
int len;
vimconv_T vimconv;
- char_u *tofree = NULL;
+ char_u *tofree = NULL;
vimconv.vc_type = CONV_NONE;
if (enc_canon_props(curbuf->b_p_fenc) & ENC_8BIT) {
@@ -1858,8 +1908,9 @@ void utf_find_illegal(void)
if (vimconv.vc_type != CONV_NONE) {
xfree(tofree);
tofree = string_convert(&vimconv, p, NULL);
- if (tofree == NULL)
+ if (tofree == NULL) {
break;
+ }
p = tofree;
}
@@ -1868,10 +1919,10 @@ void utf_find_illegal(void)
* utf_ptr2len()) or too many of them (overlong sequence). */
len = utf_ptr2len(p);
if (*p >= 0x80 && (len == 1
- || utf_char2len(utf_ptr2char(p)) != len)) {
- if (vimconv.vc_type == CONV_NONE)
+ || utf_char2len(utf_ptr2char(p)) != len)) {
+ if (vimconv.vc_type == CONV_NONE) {
curwin->w_cursor.col += (colnr_T)(p - get_cursor_pos_ptr());
- else {
+ } else {
int l;
len = (int)(p - tofree);
@@ -1884,13 +1935,14 @@ void utf_find_illegal(void)
}
p += len;
}
- if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count)
+ if (curwin->w_cursor.lnum == curbuf->b_ml.ml_line_count) {
break;
+ }
++curwin->w_cursor.lnum;
curwin->w_cursor.col = 0;
}
- /* didn't find it: don't move and beep */
+ // didn't find it: don't move and beep
curwin->w_cursor = pos;
beep_flush();
@@ -1944,13 +1996,10 @@ void mb_check_adjust_col(void *win_)
}
}
-/*
- * Return a pointer to the character before "*p", if there is one.
- */
-char_u * mb_prevptr(
- char_u *line, /* start of the string */
- char_u *p
- )
+/// @param line start of the string
+///
+/// @return a pointer to the character before "*p", if there is one.
+char_u * mb_prevptr(char_u *line, char_u *p)
{
if (p > line) {
MB_PTR_BACK(line, p);
@@ -1964,14 +2013,16 @@ char_u * mb_prevptr(
*/
int mb_charlen(char_u *str)
{
- char_u *p = str;
+ char_u *p = str;
int count;
- if (p == NULL)
+ if (p == NULL) {
return 0;
+ }
- for (count = 0; *p != NUL; count++)
+ for (count = 0; *p != NUL; count++) {
p += (*mb_ptr2len)(p);
+ }
return count;
}
@@ -1981,11 +2032,12 @@ int mb_charlen(char_u *str)
*/
int mb_charlen_len(char_u *str, int len)
{
- char_u *p = str;
+ char_u *p = str;
int count;
- for (count = 0; *p != NUL && p < str + len; count++)
+ for (count = 0; *p != NUL && p < str + len; count++) {
p += (*mb_ptr2len)(p);
+ }
return count;
}
@@ -2049,10 +2101,12 @@ const char *mb_unescape(const char **const pp)
*/
char_u * enc_skip(char_u *p)
{
- if (STRNCMP(p, "2byte-", 6) == 0)
+ if (STRNCMP(p, "2byte-", 6) == 0) {
return p + 6;
- if (STRNCMP(p, "8bit-", 5) == 0)
+ }
+ if (STRNCMP(p, "8bit-", 5) == 0) {
return p + 5;
+ }
return p;
}
@@ -2064,7 +2118,7 @@ char_u * enc_skip(char_u *p)
*/
char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
{
- char_u *p, *s;
+ char_u *p, *s;
int i;
if (STRCMP(enc, "default") == 0) {
@@ -2072,47 +2126,51 @@ char_u *enc_canonize(char_u *enc) FUNC_ATTR_NONNULL_RET
return vim_strsave(fenc_default);
}
- /* copy "enc" to allocated memory, with room for two '-' */
+ // copy "enc" to allocated memory, with room for two '-'
char_u *r = xmalloc(STRLEN(enc) + 3);
- /* Make it all lower case and replace '_' with '-'. */
+ // Make it all lower case and replace '_' with '-'.
p = r;
for (s = enc; *s != NUL; ++s) {
- if (*s == '_')
+ if (*s == '_') {
*p++ = '-';
- else
+ } else {
*p++ = TOLOWER_ASC(*s);
+ }
}
*p = NUL;
- /* Skip "2byte-" and "8bit-". */
+ // Skip "2byte-" and "8bit-".
p = enc_skip(r);
- /* Change "microsoft-cp" to "cp". Used in some spell files. */
- if (STRNCMP(p, "microsoft-cp", 12) == 0)
+ // Change "microsoft-cp" to "cp". Used in some spell files.
+ if (STRNCMP(p, "microsoft-cp", 12) == 0) {
STRMOVE(p, p + 10);
+ }
- /* "iso8859" -> "iso-8859" */
+ // "iso8859" -> "iso-8859"
if (STRNCMP(p, "iso8859", 7) == 0) {
STRMOVE(p + 4, p + 3);
p[3] = '-';
}
- /* "iso-8859n" -> "iso-8859-n" */
+ // "iso-8859n" -> "iso-8859-n"
if (STRNCMP(p, "iso-8859", 8) == 0 && p[8] != '-') {
STRMOVE(p + 9, p + 8);
p[8] = '-';
}
- /* "latin-N" -> "latinN" */
- if (STRNCMP(p, "latin-", 6) == 0)
+ // "latin-N" -> "latinN"
+ if (STRNCMP(p, "latin-", 6) == 0) {
STRMOVE(p + 5, p + 6);
+ }
if (enc_canon_search(p) >= 0) {
- /* canonical name can be used unmodified */
- if (p != r)
+ // canonical name can be used unmodified
+ if (p != r) {
STRMOVE(r, p);
+ }
} else if ((i = enc_alias_search(p)) >= 0) {
- /* alias recognized, get canonical name */
+ // alias recognized, get canonical name
xfree(r);
r = vim_strsave((char_u *)enc_canon_table[i].name);
}
@@ -2127,9 +2185,11 @@ static int enc_alias_search(char_u *name)
{
int i;
- for (i = 0; enc_alias_table[i].name != NULL; ++i)
- if (STRCMP(name, enc_alias_table[i].name) == 0)
+ for (i = 0; enc_alias_table[i].name != NULL; ++i) {
+ if (STRCMP(name, enc_alias_table[i].name) == 0) {
return enc_alias_table[i].canon;
+ }
+ }
return -1;
}
@@ -2219,13 +2279,13 @@ void * my_iconv_open(char_u *to, char_u *from)
iconv_t fd;
#define ICONV_TESTLEN 400
char_u tobuf[ICONV_TESTLEN];
- char *p;
+ char *p;
size_t tolen;
static WorkingStatus iconv_working = kUnknown;
- if (iconv_working == kBroken)
- return (void *)-1; /* detected a broken iconv() previously */
-
+ if (iconv_working == kBroken) {
+ return (void *)-1; // detected a broken iconv() previously
+ }
fd = iconv_open((char *)enc_skip(to), (char *)enc_skip(from));
if (fd != (iconv_t)-1 && iconv_working == kUnknown) {
@@ -2243,8 +2303,9 @@ void * my_iconv_open(char_u *to, char_u *from)
iconv_working = kBroken;
iconv_close(fd);
fd = (iconv_t)-1;
- } else
+ } else {
iconv_working = kWorking;
+ }
}
return (void *)fd;
@@ -2257,17 +2318,17 @@ void * my_iconv_open(char_u *to, char_u *from)
* Returns the converted string in allocated memory. NULL for an error.
* If resultlenp is not NULL, sets it to the result length in bytes.
*/
-static char_u *iconv_string(const vimconv_T *const vcp, char_u *str,
- size_t slen, size_t *unconvlenp, size_t *resultlenp)
+static char_u *iconv_string(const vimconv_T *const vcp, char_u *str, size_t slen,
+ size_t *unconvlenp, size_t *resultlenp)
{
- const char *from;
+ const char *from;
size_t fromlen;
- char *to;
+ char *to;
size_t tolen;
size_t len = 0;
size_t done = 0;
- char_u *result = NULL;
- char_u *p;
+ char_u *result = NULL;
+ char_u *p;
int l;
from = (char *)str;
@@ -2278,8 +2339,9 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str,
* increase the buffer size. */
len = len + fromlen * 2 + 40;
p = xmalloc(len);
- if (done > 0)
+ if (done > 0) {
memmove(p, result, done);
+ }
xfree(result);
result = p;
}
@@ -2327,8 +2389,9 @@ static char_u *iconv_string(const vimconv_T *const vcp, char_u *str,
done = to - (char *)result;
}
- if (resultlenp != NULL && result != NULL)
+ if (resultlenp != NULL && result != NULL) {
*resultlenp = (size_t)(to - (char *)result);
+ }
return result;
}
@@ -2353,8 +2416,8 @@ int convert_setup(vimconv_T *vcp, char_u *from, char_u *to)
/// As convert_setup(), but only when from_unicode_is_utf8 is true will all
/// "from" unicode charsets be considered utf-8. Same for "to".
-int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8,
- char_u *to, bool to_unicode_is_utf8)
+int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8, char_u *to,
+ bool to_unicode_is_utf8)
{
int from_prop;
int to_prop;
@@ -2369,51 +2432,54 @@ int convert_setup_ext(vimconv_T *vcp, char_u *from, bool from_unicode_is_utf8,
# endif
*vcp = (vimconv_T)MBYTE_NONE_CONV;
- /* No conversion when one of the names is empty or they are equal. */
+ // No conversion when one of the names is empty or they are equal.
if (from == NULL || *from == NUL || to == NULL || *to == NUL
- || STRCMP(from, to) == 0)
+ || STRCMP(from, to) == 0) {
return OK;
+ }
from_prop = enc_canon_props(from);
to_prop = enc_canon_props(to);
- if (from_unicode_is_utf8)
+ if (from_unicode_is_utf8) {
from_is_utf8 = from_prop & ENC_UNICODE;
- else
+ } else {
from_is_utf8 = from_prop == ENC_UNICODE;
- if (to_unicode_is_utf8)
+ }
+ if (to_unicode_is_utf8) {
to_is_utf8 = to_prop & ENC_UNICODE;
- else
+ } else {
to_is_utf8 = to_prop == ENC_UNICODE;
+ }
if ((from_prop & ENC_LATIN1) && to_is_utf8) {
- /* Internal latin1 -> utf-8 conversion. */
+ // Internal latin1 -> utf-8 conversion.
vcp->vc_type = CONV_TO_UTF8;
- vcp->vc_factor = 2; /* up to twice as long */
+ vcp->vc_factor = 2; // up to twice as long
} else if ((from_prop & ENC_LATIN9) && to_is_utf8) {
- /* Internal latin9 -> utf-8 conversion. */
+ // Internal latin9 -> utf-8 conversion.
vcp->vc_type = CONV_9_TO_UTF8;
- vcp->vc_factor = 3; /* up to three as long (euro sign) */
+ vcp->vc_factor = 3; // up to three as long (euro sign)
} else if (from_is_utf8 && (to_prop & ENC_LATIN1)) {
- /* Internal utf-8 -> latin1 conversion. */
+ // Internal utf-8 -> latin1 conversion.
vcp->vc_type = CONV_TO_LATIN1;
} else if (from_is_utf8 && (to_prop & ENC_LATIN9)) {
- /* Internal utf-8 -> latin9 conversion. */
+ // Internal utf-8 -> latin9 conversion.
vcp->vc_type = CONV_TO_LATIN9;
}
# ifdef HAVE_ICONV
else { // NOLINT(readability/braces)
// Use iconv() for conversion.
- vcp->vc_fd = (iconv_t)my_iconv_open(
- to_is_utf8 ? (char_u *)"utf-8" : to,
- from_is_utf8 ? (char_u *)"utf-8" : from);
+ vcp->vc_fd = (iconv_t)my_iconv_open(to_is_utf8 ? (char_u *)"utf-8" : to,
+ from_is_utf8 ? (char_u *)"utf-8" : from);
if (vcp->vc_fd != (iconv_t)-1) {
vcp->vc_type = CONV_ICONV;
- vcp->vc_factor = 4; /* could be longer too... */
+ vcp->vc_factor = 4; // could be longer too...
}
}
# endif
- if (vcp->vc_type == CONV_NONE)
+ if (vcp->vc_type == CONV_NONE) {
return FAIL;
+ }
return OK;
}
@@ -2435,129 +2501,153 @@ char_u *string_convert(const vimconv_T *const vcp, char_u *ptr, size_t *lenp)
* an incomplete sequence at the end it is not converted and "*unconvlenp" is
* set to the number of remaining bytes.
*/
-char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr,
- size_t *lenp, size_t *unconvlenp)
+char_u * string_convert_ext(const vimconv_T *const vcp, char_u *ptr, size_t *lenp,
+ size_t *unconvlenp)
{
- char_u *retval = NULL;
- char_u *d;
+ char_u *retval = NULL;
+ char_u *d;
int l;
int c;
size_t len;
- if (lenp == NULL)
+ if (lenp == NULL) {
len = STRLEN(ptr);
- else
+ } else {
len = *lenp;
- if (len == 0)
+ }
+ if (len == 0) {
return vim_strsave((char_u *)"");
+ }
switch (vcp->vc_type) {
- case CONV_TO_UTF8: /* latin1 to utf-8 conversion */
- retval = xmalloc(len * 2 + 1);
- d = retval;
- for (size_t i = 0; i < len; ++i) {
- c = ptr[i];
- if (c < 0x80)
- *d++ = c;
- else {
- *d++ = 0xc0 + ((unsigned)c >> 6);
- *d++ = 0x80 + (c & 0x3f);
- }
+ case CONV_TO_UTF8: // latin1 to utf-8 conversion
+ retval = xmalloc(len * 2 + 1);
+ d = retval;
+ for (size_t i = 0; i < len; ++i) {
+ c = ptr[i];
+ if (c < 0x80) {
+ *d++ = c;
+ } else {
+ *d++ = 0xc0 + ((unsigned)c >> 6);
+ *d++ = 0x80 + (c & 0x3f);
}
- *d = NUL;
- if (lenp != NULL)
- *lenp = (size_t)(d - retval);
- break;
+ }
+ *d = NUL;
+ if (lenp != NULL) {
+ *lenp = (size_t)(d - retval);
+ }
+ break;
- case CONV_9_TO_UTF8: /* latin9 to utf-8 conversion */
- retval = xmalloc(len * 3 + 1);
- d = retval;
- for (size_t i = 0; i < len; ++i) {
- c = ptr[i];
- switch (c) {
- case 0xa4: c = 0x20ac; break; /* euro */
- case 0xa6: c = 0x0160; break; /* S hat */
- case 0xa8: c = 0x0161; break; /* S -hat */
- case 0xb4: c = 0x017d; break; /* Z hat */
- case 0xb8: c = 0x017e; break; /* Z -hat */
- case 0xbc: c = 0x0152; break; /* OE */
- case 0xbd: c = 0x0153; break; /* oe */
- case 0xbe: c = 0x0178; break; /* Y */
- }
- d += utf_char2bytes(c, d);
+ case CONV_9_TO_UTF8: // latin9 to utf-8 conversion
+ retval = xmalloc(len * 3 + 1);
+ d = retval;
+ for (size_t i = 0; i < len; ++i) {
+ c = ptr[i];
+ switch (c) {
+ case 0xa4:
+ c = 0x20ac; break; // euro
+ case 0xa6:
+ c = 0x0160; break; // S hat
+ case 0xa8:
+ c = 0x0161; break; // S -hat
+ case 0xb4:
+ c = 0x017d; break; // Z hat
+ case 0xb8:
+ c = 0x017e; break; // Z -hat
+ case 0xbc:
+ c = 0x0152; break; // OE
+ case 0xbd:
+ c = 0x0153; break; // oe
+ case 0xbe:
+ c = 0x0178; break; // Y
}
- *d = NUL;
- if (lenp != NULL)
- *lenp = (size_t)(d - retval);
- break;
+ d += utf_char2bytes(c, d);
+ }
+ *d = NUL;
+ if (lenp != NULL) {
+ *lenp = (size_t)(d - retval);
+ }
+ break;
- case CONV_TO_LATIN1: /* utf-8 to latin1 conversion */
- case CONV_TO_LATIN9: /* utf-8 to latin9 conversion */
- retval = xmalloc(len + 1);
- d = retval;
- for (size_t i = 0; i < len; ++i) {
- l = utf_ptr2len_len(ptr + i, len - i);
- if (l == 0)
- *d++ = NUL;
- else if (l == 1) {
- uint8_t l_w = utf8len_tab_zero[ptr[i]];
-
- if (l_w == 0) {
- /* Illegal utf-8 byte cannot be converted */
+ case CONV_TO_LATIN1: // utf-8 to latin1 conversion
+ case CONV_TO_LATIN9: // utf-8 to latin9 conversion
+ retval = xmalloc(len + 1);
+ d = retval;
+ for (size_t i = 0; i < len; ++i) {
+ l = utf_ptr2len_len(ptr + i, len - i);
+ if (l == 0) {
+ *d++ = NUL;
+ } else if (l == 1) {
+ uint8_t l_w = utf8len_tab_zero[ptr[i]];
+
+ if (l_w == 0) {
+ // Illegal utf-8 byte cannot be converted
+ xfree(retval);
+ return NULL;
+ }
+ if (unconvlenp != NULL && l_w > len - i) {
+ // Incomplete sequence at the end.
+ *unconvlenp = len - i;
+ break;
+ }
+ *d++ = ptr[i];
+ } else {
+ c = utf_ptr2char(ptr + i);
+ if (vcp->vc_type == CONV_TO_LATIN9) {
+ switch (c) {
+ case 0x20ac:
+ c = 0xa4; break; // euro
+ case 0x0160:
+ c = 0xa6; break; // S hat
+ case 0x0161:
+ c = 0xa8; break; // S -hat
+ case 0x017d:
+ c = 0xb4; break; // Z hat
+ case 0x017e:
+ c = 0xb8; break; // Z -hat
+ case 0x0152:
+ c = 0xbc; break; // OE
+ case 0x0153:
+ c = 0xbd; break; // oe
+ case 0x0178:
+ c = 0xbe; break; // Y
+ case 0xa4:
+ case 0xa6:
+ case 0xa8:
+ case 0xb4:
+ case 0xb8:
+ case 0xbc:
+ case 0xbd:
+ case 0xbe:
+ c = 0x100; break; // not in latin9
+ }
+ }
+ if (!utf_iscomposing(c)) { // skip composing chars
+ if (c < 0x100) {
+ *d++ = c;
+ } else if (vcp->vc_fail) {
xfree(retval);
return NULL;
- }
- if (unconvlenp != NULL && l_w > len - i) {
- /* Incomplete sequence at the end. */
- *unconvlenp = len - i;
- break;
- }
- *d++ = ptr[i];
- } else {
- c = utf_ptr2char(ptr + i);
- if (vcp->vc_type == CONV_TO_LATIN9)
- switch (c) {
- case 0x20ac: c = 0xa4; break; /* euro */
- case 0x0160: c = 0xa6; break; /* S hat */
- case 0x0161: c = 0xa8; break; /* S -hat */
- case 0x017d: c = 0xb4; break; /* Z hat */
- case 0x017e: c = 0xb8; break; /* Z -hat */
- case 0x0152: c = 0xbc; break; /* OE */
- case 0x0153: c = 0xbd; break; /* oe */
- case 0x0178: c = 0xbe; break; /* Y */
- case 0xa4:
- case 0xa6:
- case 0xa8:
- case 0xb4:
- case 0xb8:
- case 0xbc:
- case 0xbd:
- case 0xbe: c = 0x100; break; /* not in latin9 */
- }
- if (!utf_iscomposing(c)) { /* skip composing chars */
- if (c < 0x100)
- *d++ = c;
- else if (vcp->vc_fail) {
- xfree(retval);
- return NULL;
- } else {
- *d++ = 0xbf;
- if (utf_char2cells(c) > 1)
- *d++ = '?';
+ } else {
+ *d++ = 0xbf;
+ if (utf_char2cells(c) > 1) {
+ *d++ = '?';
}
}
- i += l - 1;
}
+ i += l - 1;
}
- *d = NUL;
- if (lenp != NULL)
- *lenp = (size_t)(d - retval);
- break;
+ }
+ *d = NUL;
+ if (lenp != NULL) {
+ *lenp = (size_t)(d - retval);
+ }
+ break;
# ifdef HAVE_ICONV
- case CONV_ICONV: // conversion with vcp->vc_fd
- retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
- break;
+ case CONV_ICONV: // conversion with vcp->vc_fd
+ retval = iconv_string(vcp, ptr, len, unconvlenp, lenp);
+ break;
# endif
}