diff options
author | zeertzjq <zeertzjq@outlook.com> | 2022-07-10 06:01:49 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-10 06:01:49 +0800 |
commit | d6a1e718813f744b997e1b8fc707cbd47125db5c (patch) | |
tree | f66d31a64479a248cde583f11a593d8c199c6057 | |
parent | 782f7261363f7242cf7472e64434604915fa3075 (diff) | |
download | rneovim-d6a1e718813f744b997e1b8fc707cbd47125db5c.tar.gz rneovim-d6a1e718813f744b997e1b8fc707cbd47125db5c.tar.bz2 rneovim-d6a1e718813f744b997e1b8fc707cbd47125db5c.zip |
vim-patch:8.1.1038: Arabic support excludes Farsi (#19285)
Problem: Arabic support excludes Farsi.
Solution: Add Farsi support to the Arabic support. (Ali Gholami Rudi,
Ameretat Reith)
https://github.com/vim/vim/commit/dc4fa190e7b9d6ba49416ce875d2192c4444d3eb
Omit Test_shape_final_to_medial(): removed in later patches.
-rw-r--r-- | src/nvim/arabic.c | 1066 | ||||
-rw-r--r-- | src/nvim/arabic.h | 7 | ||||
-rw-r--r-- | src/nvim/ex_getln.c | 4 | ||||
-rw-r--r-- | src/nvim/grid.c | 2 | ||||
-rw-r--r-- | src/nvim/screen.c | 4 |
5 files changed, 189 insertions, 894 deletions
diff --git a/src/nvim/arabic.c b/src/nvim/arabic.c index 130ce65b86..06536e6e2b 100644 --- a/src/nvim/arabic.c +++ b/src/nvim/arabic.c @@ -5,6 +5,13 @@ /// /// Functions for Arabic language. /// +/// Author: Nadim Shaikli & Isam Bayazidi +/// Farsi support and restructuring to make adding new letters easier by Ali +/// Gholami Rudi. Further work by Ameretat Reith. + +/// Sorted list of unicode Arabic characters. Each entry holds the +/// presentation forms of a letter. +/// /// Arabic characters are categorized into following types: /// /// Isolated - iso-8859-6 form char denoted with a_* @@ -19,12 +26,7 @@ #include "nvim/ascii.h" #include "nvim/vim.h" -// Arabic ISO-10646-1 character set definition - -// Arabic ISO-8859-6 (subset of 10646; 0600 - 06FF) -#define a_COMMA 0x060C -#define a_SEMICOLON 0x061B -#define a_QUESTION 0x061F +// Unicode values for Arabic characters. #define a_HAMZA 0x0621 #define a_ALEF_MADDA 0x0622 #define a_ALEF_HAMZA_ABOVE 0x0623 @@ -62,7 +64,6 @@ #define a_WAW 0x0648 #define a_ALEF_MAKSURA 0x0649 #define a_YEH 0x064a - #define a_FATHATAN 0x064b #define a_DAMMATAN 0x064c #define a_KASRATAN 0x064d @@ -71,168 +72,17 @@ #define a_KASRA 0x0650 #define a_SHADDA 0x0651 #define a_SUKUN 0x0652 - #define a_MADDA_ABOVE 0x0653 #define a_HAMZA_ABOVE 0x0654 #define a_HAMZA_BELOW 0x0655 -#define a_ZERO 0x0660 -#define a_ONE 0x0661 -#define a_TWO 0x0662 -#define a_THREE 0x0663 -#define a_FOUR 0x0664 -#define a_FIVE 0x0665 -#define a_SIX 0x0666 -#define a_SEVEN 0x0667 -#define a_EIGHT 0x0668 -#define a_NINE 0x0669 -#define a_PERCENT 0x066a -#define a_DECIMAL 0x066b -#define a_THOUSANDS 0x066c -#define a_STAR 0x066d -#define a_MINI_ALEF 0x0670 -// Rest of 8859-6 does not relate to Arabic +#define a_PEH 0x067e +#define a_TCHEH 0x0686 +#define a_JEH 0x0698 +#define a_FKAF 0x06a9 +#define a_GAF 0x06af +#define a_FYEH 0x06cc -// Arabic Presentation Form-B (subset of 10646; FE70 - FEFF) -// -// s -> isolated -// i -> initial -// m -> medial -// f -> final -#define a_s_FATHATAN 0xfe70 -#define a_m_TATWEEL_FATHATAN 0xfe71 -#define a_s_DAMMATAN 0xfe72 - -#define a_s_KASRATAN 0xfe74 - -#define a_s_FATHA 0xfe76 -#define a_m_FATHA 0xfe77 -#define a_s_DAMMA 0xfe78 -#define a_m_DAMMA 0xfe79 -#define a_s_KASRA 0xfe7a -#define a_m_KASRA 0xfe7b -#define a_s_SHADDA 0xfe7c -#define a_m_SHADDA 0xfe7d -#define a_s_SUKUN 0xfe7e -#define a_m_SUKUN 0xfe7f - -#define a_s_HAMZA 0xfe80 -#define a_s_ALEF_MADDA 0xfe81 -#define a_f_ALEF_MADDA 0xfe82 -#define a_s_ALEF_HAMZA_ABOVE 0xfe83 -#define a_f_ALEF_HAMZA_ABOVE 0xfe84 -#define a_s_WAW_HAMZA 0xfe85 -#define a_f_WAW_HAMZA 0xfe86 -#define a_s_ALEF_HAMZA_BELOW 0xfe87 -#define a_f_ALEF_HAMZA_BELOW 0xfe88 -#define a_s_YEH_HAMZA 0xfe89 -#define a_f_YEH_HAMZA 0xfe8a -#define a_i_YEH_HAMZA 0xfe8b -#define a_m_YEH_HAMZA 0xfe8c -#define a_s_ALEF 0xfe8d -#define a_f_ALEF 0xfe8e -#define a_s_BEH 0xfe8f -#define a_f_BEH 0xfe90 -#define a_i_BEH 0xfe91 -#define a_m_BEH 0xfe92 -#define a_s_TEH_MARBUTA 0xfe93 -#define a_f_TEH_MARBUTA 0xfe94 -#define a_s_TEH 0xfe95 -#define a_f_TEH 0xfe96 -#define a_i_TEH 0xfe97 -#define a_m_TEH 0xfe98 -#define a_s_THEH 0xfe99 -#define a_f_THEH 0xfe9a -#define a_i_THEH 0xfe9b -#define a_m_THEH 0xfe9c -#define a_s_JEEM 0xfe9d -#define a_f_JEEM 0xfe9e -#define a_i_JEEM 0xfe9f -#define a_m_JEEM 0xfea0 -#define a_s_HAH 0xfea1 -#define a_f_HAH 0xfea2 -#define a_i_HAH 0xfea3 -#define a_m_HAH 0xfea4 -#define a_s_KHAH 0xfea5 -#define a_f_KHAH 0xfea6 -#define a_i_KHAH 0xfea7 -#define a_m_KHAH 0xfea8 -#define a_s_DAL 0xfea9 -#define a_f_DAL 0xfeaa -#define a_s_THAL 0xfeab -#define a_f_THAL 0xfeac -#define a_s_REH 0xfead -#define a_f_REH 0xfeae -#define a_s_ZAIN 0xfeaf -#define a_f_ZAIN 0xfeb0 -#define a_s_SEEN 0xfeb1 -#define a_f_SEEN 0xfeb2 -#define a_i_SEEN 0xfeb3 -#define a_m_SEEN 0xfeb4 -#define a_s_SHEEN 0xfeb5 -#define a_f_SHEEN 0xfeb6 -#define a_i_SHEEN 0xfeb7 -#define a_m_SHEEN 0xfeb8 -#define a_s_SAD 0xfeb9 -#define a_f_SAD 0xfeba -#define a_i_SAD 0xfebb -#define a_m_SAD 0xfebc -#define a_s_DAD 0xfebd -#define a_f_DAD 0xfebe -#define a_i_DAD 0xfebf -#define a_m_DAD 0xfec0 -#define a_s_TAH 0xfec1 -#define a_f_TAH 0xfec2 -#define a_i_TAH 0xfec3 -#define a_m_TAH 0xfec4 -#define a_s_ZAH 0xfec5 -#define a_f_ZAH 0xfec6 -#define a_i_ZAH 0xfec7 -#define a_m_ZAH 0xfec8 -#define a_s_AIN 0xfec9 -#define a_f_AIN 0xfeca -#define a_i_AIN 0xfecb -#define a_m_AIN 0xfecc -#define a_s_GHAIN 0xfecd -#define a_f_GHAIN 0xfece -#define a_i_GHAIN 0xfecf -#define a_m_GHAIN 0xfed0 -#define a_s_FEH 0xfed1 -#define a_f_FEH 0xfed2 -#define a_i_FEH 0xfed3 -#define a_m_FEH 0xfed4 -#define a_s_QAF 0xfed5 -#define a_f_QAF 0xfed6 -#define a_i_QAF 0xfed7 -#define a_m_QAF 0xfed8 -#define a_s_KAF 0xfed9 -#define a_f_KAF 0xfeda -#define a_i_KAF 0xfedb -#define a_m_KAF 0xfedc -#define a_s_LAM 0xfedd -#define a_f_LAM 0xfede -#define a_i_LAM 0xfedf -#define a_m_LAM 0xfee0 -#define a_s_MEEM 0xfee1 -#define a_f_MEEM 0xfee2 -#define a_i_MEEM 0xfee3 -#define a_m_MEEM 0xfee4 -#define a_s_NOON 0xfee5 -#define a_f_NOON 0xfee6 -#define a_i_NOON 0xfee7 -#define a_m_NOON 0xfee8 -#define a_s_HEH 0xfee9 -#define a_f_HEH 0xfeea -#define a_i_HEH 0xfeeb -#define a_m_HEH 0xfeec -#define a_s_WAW 0xfeed -#define a_f_WAW 0xfeee -#define a_s_ALEF_MAKSURA 0xfeef -#define a_f_ALEF_MAKSURA 0xfef0 -#define a_s_YEH 0xfef1 -#define a_f_YEH 0xfef2 -#define a_i_YEH 0xfef3 -#define a_m_YEH 0xfef4 #define a_s_LAM_ALEF_MADDA_ABOVE 0xfef5 #define a_f_LAM_ALEF_MADDA_ABOVE 0xfef6 #define a_s_LAM_ALEF_HAMZA_ABOVE 0xfef7 @@ -242,664 +92,201 @@ #define a_s_LAM_ALEF 0xfefb #define a_f_LAM_ALEF 0xfefc +static struct achar { + unsigned c; + unsigned isolated; + unsigned initial; + unsigned medial; + unsigned final; +} achars[] = { + { a_HAMZA, 0xfe80, 0, 0, 0 }, + { a_ALEF_MADDA, 0xfe81, 0, 0, 0xfe82 }, + { a_ALEF_HAMZA_ABOVE, 0xfe83, 0, 0, 0xfe84 }, + { a_WAW_HAMZA, 0xfe85, 0, 0, 0xfe86 }, + { a_ALEF_HAMZA_BELOW, 0xfe87, 0, 0, 0xfe88 }, + { a_YEH_HAMZA, 0xfe89, 0xfe8b, 0xfe8c, 0xfe8a }, + { a_ALEF, 0xfe8d, 0, 0, 0xfe8e }, + { a_BEH, 0xfe8f, 0xfe91, 0xfe92, 0xfe90 }, + { a_TEH_MARBUTA, 0xfe93, 0, 0, 0xfe94 }, + { a_TEH, 0xfe95, 0xfe97, 0xfe98, 0xfe96 }, + { a_THEH, 0xfe99, 0xfe9b, 0xfe9c, 0xfe9a }, + { a_JEEM, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e }, + { a_HAH, 0xfea1, 0xfea3, 0xfea4, 0xfea2 }, + { a_KHAH, 0xfea5, 0xfea7, 0xfea8, 0xfea6 }, + { a_DAL, 0xfea9, 0, 0, 0xfeaa }, + { a_THAL, 0xfeab, 0, 0, 0xfeac }, + { a_REH, 0xfead, 0, 0, 0xfeae }, + { a_ZAIN, 0xfeaf, 0, 0, 0xfeb0 }, + { a_SEEN, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2 }, + { a_SHEEN, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6 }, + { a_SAD, 0xfeb9, 0xfebb, 0xfebc, 0xfeba }, + { a_DAD, 0xfebd, 0xfebf, 0xfec0, 0xfebe }, + { a_TAH, 0xfec1, 0xfec3, 0xfec4, 0xfec2 }, + { a_ZAH, 0xfec5, 0xfec7, 0xfec8, 0xfec6 }, + { a_AIN, 0xfec9, 0xfecb, 0xfecc, 0xfeca }, + { a_GHAIN, 0xfecd, 0xfecf, 0xfed0, 0xfece }, + { a_TATWEEL, 0, 0x0640, 0x0640, 0x0640 }, + { a_FEH, 0xfed1, 0xfed3, 0xfed4, 0xfed2 }, + { a_QAF, 0xfed5, 0xfed7, 0xfed8, 0xfed6 }, + { a_KAF, 0xfed9, 0xfedb, 0xfedc, 0xfeda }, + { a_LAM, 0xfedd, 0xfedf, 0xfee0, 0xfede }, + { a_MEEM, 0xfee1, 0xfee3, 0xfee4, 0xfee2 }, + { a_NOON, 0xfee5, 0xfee7, 0xfee8, 0xfee6 }, + { a_HEH, 0xfee9, 0xfeeb, 0xfeec, 0xfeea }, + { a_WAW, 0xfeed, 0, 0, 0xfeee }, + { a_ALEF_MAKSURA, 0xfeef, 0, 0, 0xfef0 }, + { a_YEH, 0xfef1, 0xfef3, 0xfef4, 0xfef2 }, + { a_FATHATAN, 0xfe70, 0, 0, 0 }, + { a_DAMMATAN, 0xfe72, 0, 0, 0 }, + { a_KASRATAN, 0xfe74, 0, 0, 0 }, + { a_FATHA, 0xfe76, 0, 0xfe77, 0 }, + { a_DAMMA, 0xfe78, 0, 0xfe79, 0 }, + { a_KASRA, 0xfe7a, 0, 0xfe7b, 0 }, + { a_SHADDA, 0xfe7c, 0, 0xfe7c, 0 }, + { a_SUKUN, 0xfe7e, 0, 0xfe7f, 0 }, + { a_MADDA_ABOVE, 0, 0, 0, 0 }, + { a_HAMZA_ABOVE, 0, 0, 0, 0 }, + { a_HAMZA_BELOW, 0, 0, 0, 0 }, + { a_PEH, 0xfb56, 0xfb58, 0xfb59, 0xfb57 }, + { a_TCHEH, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b }, + { a_JEH, 0xfb8a, 0, 0, 0xfb8b }, + { a_FKAF, 0xfb8e, 0xfb90, 0xfb91, 0xfb8f }, + { a_GAF, 0xfb92, 0xfb94, 0xfb95, 0xfb93 }, + { a_FYEH, 0xfbfc, 0xfbfe, 0xfbff, 0xfbfd }, +}; + #define a_BYTE_ORDER_MARK 0xfeff #ifdef INCLUDE_GENERATED_DECLARATIONS # include "arabic.c.generated.h" #endif -// Returns true if c is an ISO-8859-6 shaped ARABIC letter (user entered). -static bool A_is_a(int cur_c) -{ - switch (cur_c) { - case a_HAMZA: - case a_ALEF_MADDA: - case a_ALEF_HAMZA_ABOVE: - case a_WAW_HAMZA: - case a_ALEF_HAMZA_BELOW: - case a_YEH_HAMZA: - case a_ALEF: - case a_BEH: - case a_TEH_MARBUTA: - case a_TEH: - case a_THEH: - case a_JEEM: - case a_HAH: - case a_KHAH: - case a_DAL: - case a_THAL: - case a_REH: - case a_ZAIN: - case a_SEEN: - case a_SHEEN: - case a_SAD: - case a_DAD: - case a_TAH: - case a_ZAH: - case a_AIN: - case a_GHAIN: - case a_TATWEEL: - case a_FEH: - case a_QAF: - case a_KAF: - case a_LAM: - case a_MEEM: - case a_NOON: - case a_HEH: - case a_WAW: - case a_ALEF_MAKSURA: - case a_YEH: - return true; - } - - return false; -} - -// Returns true if c is an Isolated Form-B ARABIC letter -static bool A_is_s(int cur_c) +/// Find the struct achar pointer to the given Arabic char. +/// Returns NULL if not found. +static struct achar *find_achar(int c) { - switch (cur_c) { - case a_s_HAMZA: - case a_s_ALEF_MADDA: - case a_s_ALEF_HAMZA_ABOVE: - case a_s_WAW_HAMZA: - case a_s_ALEF_HAMZA_BELOW: - case a_s_YEH_HAMZA: - case a_s_ALEF: - case a_s_BEH: - case a_s_TEH_MARBUTA: - case a_s_TEH: - case a_s_THEH: - case a_s_JEEM: - case a_s_HAH: - case a_s_KHAH: - case a_s_DAL: - case a_s_THAL: - case a_s_REH: - case a_s_ZAIN: - case a_s_SEEN: - case a_s_SHEEN: - case a_s_SAD: - case a_s_DAD: - case a_s_TAH: - case a_s_ZAH: - case a_s_AIN: - case a_s_GHAIN: - case a_s_FEH: - case a_s_QAF: - case a_s_KAF: - case a_s_LAM: - case a_s_MEEM: - case a_s_NOON: - case a_s_HEH: - case a_s_WAW: - case a_s_ALEF_MAKSURA: - case a_s_YEH: - return true; + // using binary search to find c + int h = ARRAY_SIZE(achars); + int l = 0; + while (l < h) { + int m = (h + l) / 2; + if (achars[m].c == (unsigned)c) { + return &achars[m]; + } + if ((unsigned)c < achars[m].c) { + h = m; + } else { + l = m + 1; + } } - - return false; + return NULL; } -// Returns true if c is a Final shape of an ARABIC letter -static bool A_is_f(int cur_c) +/// Change shape - from Combination (2 char) to an Isolated +static int chg_c_laa2i(int hid_c) { - switch (cur_c) { - case a_f_ALEF_MADDA: - case a_f_ALEF_HAMZA_ABOVE: - case a_f_WAW_HAMZA: - case a_f_ALEF_HAMZA_BELOW: - case a_f_YEH_HAMZA: - case a_f_ALEF: - case a_f_BEH: - case a_f_TEH_MARBUTA: - case a_f_TEH: - case a_f_THEH: - case a_f_JEEM: - case a_f_HAH: - case a_f_KHAH: - case a_f_DAL: - case a_f_THAL: - case a_f_REH: - case a_f_ZAIN: - case a_f_SEEN: - case a_f_SHEEN: - case a_f_SAD: - case a_f_DAD: - case a_f_TAH: - case a_f_ZAH: - case a_f_AIN: - case a_f_GHAIN: - case a_f_FEH: - case a_f_QAF: - case a_f_KAF: - case a_f_LAM: - case a_f_MEEM: - case a_f_NOON: - case a_f_HEH: - case a_f_WAW: - case a_f_ALEF_MAKSURA: - case a_f_YEH: - case a_f_LAM_ALEF_MADDA_ABOVE: - case a_f_LAM_ALEF_HAMZA_ABOVE: - case a_f_LAM_ALEF_HAMZA_BELOW: - case a_f_LAM_ALEF: - return true; - } - return false; -} + int tempc; -// Change shape - from ISO-8859-6/Isolated to Form-B Isolated -static int chg_c_a2s(int cur_c) -{ - switch (cur_c) { - case a_HAMZA: - return a_s_HAMZA; + switch (hid_c) { case a_ALEF_MADDA: - return a_s_ALEF_MADDA; + tempc = a_s_LAM_ALEF_MADDA_ABOVE; + break; case a_ALEF_HAMZA_ABOVE: - return a_s_ALEF_HAMZA_ABOVE; - case a_WAW_HAMZA: - return a_s_WAW_HAMZA; + tempc = a_s_LAM_ALEF_HAMZA_ABOVE; + break; case a_ALEF_HAMZA_BELOW: - return a_s_ALEF_HAMZA_BELOW; - case a_YEH_HAMZA: - return a_s_YEH_HAMZA; + tempc = a_s_LAM_ALEF_HAMZA_BELOW; + break; case a_ALEF: - return a_s_ALEF; - case a_TEH_MARBUTA: - return a_s_TEH_MARBUTA; - case a_DAL: - return a_s_DAL; - case a_THAL: - return a_s_THAL; - case a_REH: - return a_s_REH; - case a_ZAIN: - return a_s_ZAIN; - case a_TATWEEL: - return cur_c; // exceptions - case a_WAW: - return a_s_WAW; - case a_ALEF_MAKSURA: - return a_s_ALEF_MAKSURA; - case a_BEH: - return a_s_BEH; - case a_TEH: - return a_s_TEH; - case a_THEH: - return a_s_THEH; - case a_JEEM: - return a_s_JEEM; - case a_HAH: - return a_s_HAH; - case a_KHAH: - return a_s_KHAH; - case a_SEEN: - return a_s_SEEN; - case a_SHEEN: - return a_s_SHEEN; - case a_SAD: - return a_s_SAD; - case a_DAD: - return a_s_DAD; - case a_TAH: - return a_s_TAH; - case a_ZAH: - return a_s_ZAH; - case a_AIN: - return a_s_AIN; - case a_GHAIN: - return a_s_GHAIN; - case a_FEH: - return a_s_FEH; - case a_QAF: - return a_s_QAF; - case a_KAF: - return a_s_KAF; - case a_LAM: - return a_s_LAM; - case a_MEEM: - return a_s_MEEM; - case a_NOON: - return a_s_NOON; - case a_HEH: - return a_s_HEH; - case a_YEH: - return a_s_YEH; + tempc = a_s_LAM_ALEF; + break; + default: + tempc = 0; } - return 0; -} -// Change shape - from ISO-8859-6/Isolated to Initial -static int chg_c_a2i(int cur_c) -{ - switch (cur_c) { - case a_YEH_HAMZA: - return a_i_YEH_HAMZA; - case a_HAMZA: - return a_s_HAMZA; // exceptions - case a_ALEF_MADDA: - return a_s_ALEF_MADDA; // exceptions - case a_ALEF_HAMZA_ABOVE: - return a_s_ALEF_HAMZA_ABOVE; // exceptions - case a_WAW_HAMZA: - return a_s_WAW_HAMZA; // exceptions - case a_ALEF_HAMZA_BELOW: - return a_s_ALEF_HAMZA_BELOW; // exceptions - case a_ALEF: - return a_s_ALEF; // exceptions - case a_TEH_MARBUTA: - return a_s_TEH_MARBUTA; // exceptions - case a_DAL: - return a_s_DAL; // exceptions - case a_THAL: - return a_s_THAL; // exceptions - case a_REH: - return a_s_REH; // exceptions - case a_ZAIN: - return a_s_ZAIN; // exceptions - case a_TATWEEL: - return cur_c; // exceptions - case a_WAW: - return a_s_WAW; // exceptions - case a_ALEF_MAKSURA: - return a_s_ALEF_MAKSURA; // exceptions - case a_BEH: - return a_i_BEH; - case a_TEH: - return a_i_TEH; - case a_THEH: - return a_i_THEH; - case a_JEEM: - return a_i_JEEM; - case a_HAH: - return a_i_HAH; - case a_KHAH: - return a_i_KHAH; - case a_SEEN: - return a_i_SEEN; - case a_SHEEN: - return a_i_SHEEN; - case a_SAD: - return a_i_SAD; - case a_DAD: - return a_i_DAD; - case a_TAH: - return a_i_TAH; - case a_ZAH: - return a_i_ZAH; - case a_AIN: - return a_i_AIN; - case a_GHAIN: - return a_i_GHAIN; - case a_FEH: - return a_i_FEH; - case a_QAF: - return a_i_QAF; - case a_KAF: - return a_i_KAF; - case a_LAM: - return a_i_LAM; - case a_MEEM: - return a_i_MEEM; - case a_NOON: - return a_i_NOON; - case a_HEH: - return a_i_HEH; - case a_YEH: - return a_i_YEH; - } - return 0; + return tempc; } -// Change shape - from ISO-8859-6/Isolated to Medial -static int chg_c_a2m(int cur_c) +/// Change shape - from Combination-Isolated to Final +static int chg_c_laa2f(int hid_c) { - switch (cur_c) { - case a_HAMZA: - return a_s_HAMZA; // exception + int tempc; + + switch (hid_c) { case a_ALEF_MADDA: - return a_f_ALEF_MADDA; // exception + tempc = a_f_LAM_ALEF_MADDA_ABOVE; + break; case a_ALEF_HAMZA_ABOVE: - return a_f_ALEF_HAMZA_ABOVE; // exception - case a_WAW_HAMZA: - return a_f_WAW_HAMZA; // exception + tempc = a_f_LAM_ALEF_HAMZA_ABOVE; + break; case a_ALEF_HAMZA_BELOW: - return a_f_ALEF_HAMZA_BELOW; // exception - case a_YEH_HAMZA: - return a_m_YEH_HAMZA; + tempc = a_f_LAM_ALEF_HAMZA_BELOW; + break; case a_ALEF: - return a_f_ALEF; // exception - case a_BEH: - return a_m_BEH; - case a_TEH_MARBUTA: - return a_f_TEH_MARBUTA; // exception - case a_TEH: - return a_m_TEH; - case a_THEH: - return a_m_THEH; - case a_JEEM: - return a_m_JEEM; - case a_HAH: - return a_m_HAH; - case a_KHAH: - return a_m_KHAH; - case a_DAL: - return a_f_DAL; // exception - case a_THAL: - return a_f_THAL; // exception - case a_REH: - return a_f_REH; // exception - case a_ZAIN: - return a_f_ZAIN; // exception - case a_SEEN: - return a_m_SEEN; - case a_SHEEN: - return a_m_SHEEN; - case a_SAD: - return a_m_SAD; - case a_DAD: - return a_m_DAD; - case a_TAH: - return a_m_TAH; - case a_ZAH: - return a_m_ZAH; - case a_AIN: - return a_m_AIN; - case a_GHAIN: - return a_m_GHAIN; - case a_TATWEEL: - return cur_c; // exception - case a_FEH: - return a_m_FEH; - case a_QAF: - return a_m_QAF; - case a_KAF: - return a_m_KAF; - case a_LAM: - return a_m_LAM; - case a_MEEM: - return a_m_MEEM; - case a_NOON: - return a_m_NOON; - case a_HEH: - return a_m_HEH; - case a_WAW: - return a_f_WAW; // exception - case a_ALEF_MAKSURA: - return a_f_ALEF_MAKSURA; // exception - case a_YEH: - return a_m_YEH; + tempc = a_f_LAM_ALEF; + break; + default: + tempc = 0; } - return 0; + + return tempc; } -// Change shape - from ISO-8859-6/Isolated to final -static int chg_c_a2f(int cur_c) +/// Returns whether it is possible to join the given letters +static int can_join(int c1, int c2) { - // NOTE: these encodings need to be accounted for - // - // a_f_ALEF_MADDA; - // a_f_ALEF_HAMZA_ABOVE; - // a_f_ALEF_HAMZA_BELOW; - // a_f_LAM_ALEF_MADDA_ABOVE; - // a_f_LAM_ALEF_HAMZA_ABOVE; - // a_f_LAM_ALEF_HAMZA_BELOW; + struct achar *a1 = find_achar(c1); + struct achar *a2 = find_achar(c2); - switch (cur_c) { - case a_HAMZA: - return a_s_HAMZA; // exception - case a_ALEF_MADDA: - return a_f_ALEF_MADDA; - case a_ALEF_HAMZA_ABOVE: - return a_f_ALEF_HAMZA_ABOVE; - case a_WAW_HAMZA: - return a_f_WAW_HAMZA; - case a_ALEF_HAMZA_BELOW: - return a_f_ALEF_HAMZA_BELOW; - case a_YEH_HAMZA: - return a_f_YEH_HAMZA; - case a_ALEF: - return a_f_ALEF; - case a_BEH: - return a_f_BEH; - case a_TEH_MARBUTA: - return a_f_TEH_MARBUTA; - case a_TEH: - return a_f_TEH; - case a_THEH: - return a_f_THEH; - case a_JEEM: - return a_f_JEEM; - case a_HAH: - return a_f_HAH; - case a_KHAH: - return a_f_KHAH; - case a_DAL: - return a_f_DAL; - case a_THAL: - return a_f_THAL; - case a_REH: - return a_f_REH; - case a_ZAIN: - return a_f_ZAIN; - case a_SEEN: - return a_f_SEEN; - case a_SHEEN: - return a_f_SHEEN; - case a_SAD: - return a_f_SAD; - case a_DAD: - return a_f_DAD; - case a_TAH: - return a_f_TAH; - case a_ZAH: - return a_f_ZAH; - case a_AIN: - return a_f_AIN; - case a_GHAIN: - return a_f_GHAIN; - case a_TATWEEL: - return cur_c; // exception - case a_FEH: - return a_f_FEH; - case a_QAF: - return a_f_QAF; - case a_KAF: - return a_f_KAF; - case a_LAM: - return a_f_LAM; - case a_MEEM: - return a_f_MEEM; - case a_NOON: - return a_f_NOON; - case a_HEH: - return a_f_HEH; - case a_WAW: - return a_f_WAW; - case a_ALEF_MAKSURA: - return a_f_ALEF_MAKSURA; - case a_YEH: - return a_f_YEH; - } - return 0; + return a1 && a2 && (a1->initial || a1->medial) && (a2->final || a2->medial); } -// Change shape - from Initial to Medial -// This code is unreachable, because for the relevant characters ARABIC_CHAR() -// is FALSE; -#if 0 -static int chg_c_i2m(int cur_c) +/// Check whether we are dealing with a character that could be regarded as an +/// Arabic combining character, need to check the character before this. +bool arabic_maycombine(int two) + FUNC_ATTR_PURE { - switch (cur_c) { - case a_i_YEH_HAMZA: - return a_m_YEH_HAMZA; - case a_i_BEH: - return a_m_BEH; - case a_i_TEH: - return a_m_TEH; - case a_i_THEH: - return a_m_THEH; - case a_i_JEEM: - return a_m_JEEM; - case a_i_HAH: - return a_m_HAH; - case a_i_KHAH: - return a_m_KHAH; - case a_i_SEEN: - return a_m_SEEN; - case a_i_SHEEN: - return a_m_SHEEN; - case a_i_SAD: - return a_m_SAD; - case a_i_DAD: - return a_m_DAD; - case a_i_TAH: - return a_m_TAH; - case a_i_ZAH: - return a_m_ZAH; - case a_i_AIN: - return a_m_AIN; - case a_i_GHAIN: - return a_m_GHAIN; - case a_i_FEH: - return a_m_FEH; - case a_i_QAF: - return a_m_QAF; - case a_i_KAF: - return a_m_KAF; - case a_i_LAM: - return a_m_LAM; - case a_i_MEEM: - return a_m_MEEM; - case a_i_NOON: - return a_m_NOON; - case a_i_HEH: - return a_m_HEH; - case a_i_YEH: - return a_m_YEH; + if (p_arshape && !p_tbidi) { + return two == a_ALEF_MADDA + || two == a_ALEF_HAMZA_ABOVE + || two == a_ALEF_HAMZA_BELOW + || two == a_ALEF; } - return 0; + return false; } -#endif -// Change shape - from Final to Medial -static int chg_c_f2m(int cur_c) +/// Check whether we are dealing with Arabic combining characters. +/// Note: these are NOT really composing characters! +/// +/// @param one First character. +/// @param two Character just after "one". +bool arabic_combine(int one, int two) + FUNC_ATTR_PURE { - switch (cur_c) { - // NOTE: these encodings are multi-positional, no ? - // case a_f_ALEF_MADDA: - // case a_f_ALEF_HAMZA_ABOVE: - // case a_f_ALEF_HAMZA_BELOW: - case a_f_YEH_HAMZA: - return a_m_YEH_HAMZA; - case a_f_WAW_HAMZA: // exceptions - case a_f_ALEF: - case a_f_TEH_MARBUTA: - case a_f_DAL: - case a_f_THAL: - case a_f_REH: - case a_f_ZAIN: - case a_f_WAW: - case a_f_ALEF_MAKSURA: - return cur_c; - case a_f_BEH: - return a_m_BEH; - case a_f_TEH: - return a_m_TEH; - case a_f_THEH: - return a_m_THEH; - case a_f_JEEM: - return a_m_JEEM; - case a_f_HAH: - return a_m_HAH; - case a_f_KHAH: - return a_m_KHAH; - case a_f_SEEN: - return a_m_SEEN; - case a_f_SHEEN: - return a_m_SHEEN; - case a_f_SAD: - return a_m_SAD; - case a_f_DAD: - return a_m_DAD; - case a_f_TAH: - return a_m_TAH; - case a_f_ZAH: - return a_m_ZAH; - case a_f_AIN: - return a_m_AIN; - case a_f_GHAIN: - return a_m_GHAIN; - case a_f_FEH: - return a_m_FEH; - case a_f_QAF: - return a_m_QAF; - case a_f_KAF: - return a_m_KAF; - case a_f_LAM: - return a_m_LAM; - case a_f_MEEM: - return a_m_MEEM; - case a_f_NOON: - return a_m_NOON; - case a_f_HEH: - return a_m_HEH; - case a_f_YEH: - return a_m_YEH; - // NOTE: these encodings are multi-positional, no ? - // case a_f_LAM_ALEF_MADDA_ABOVE: - // case a_f_LAM_ALEF_HAMZA_ABOVE: - // case a_f_LAM_ALEF_HAMZA_BELOW: - // case a_f_LAM_ALEF: + if (one == a_LAM) { + return arabic_maycombine(two); } - return 0; + return false; } -// Change shape - from Combination (2 char) to an Isolated. -static int chg_c_laa2i(int hid_c) +/// A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character +/// (alphabet/number/punctuation) +static int A_is_iso(int c) { - switch (hid_c) { - case a_ALEF_MADDA: - return a_s_LAM_ALEF_MADDA_ABOVE; - case a_ALEF_HAMZA_ABOVE: - return a_s_LAM_ALEF_HAMZA_ABOVE; - case a_ALEF_HAMZA_BELOW: - return a_s_LAM_ALEF_HAMZA_BELOW; - case a_ALEF: - return a_s_LAM_ALEF; - } - return 0; + return find_achar(c) != NULL; } -// Change shape - from Combination-Isolated to Final. -static int chg_c_laa2f(int hid_c) +/// A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B) +static int A_is_ok(int c) { - switch (hid_c) { - case a_ALEF_MADDA: - return a_f_LAM_ALEF_MADDA_ABOVE; - case a_ALEF_HAMZA_ABOVE: - return a_f_LAM_ALEF_HAMZA_ABOVE; - case a_ALEF_HAMZA_BELOW: - return a_f_LAM_ALEF_HAMZA_BELOW; - case a_ALEF: - return a_f_LAM_ALEF; - } - return 0; + return (A_is_iso(c) || c == a_BYTE_ORDER_MARK); } -// Do "half-shaping" on character "c". Return zero if no shaping. -static int half_shape(int c) +/// A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B) +/// with some exceptions/exclusions +static int A_is_valid(int c) { - if (A_is_a(c)) { - return chg_c_a2i(c); - } - - if (A_is_valid(c) && A_is_f(c)) { - return chg_c_f2m(c); - } - return 0; + return (A_is_ok(c) && c != a_HAMZA); } // Do Arabic shaping on character "c". Returns the shaped character. @@ -916,37 +303,35 @@ int arabic_shape(int c, int *ccp, int *c1p, int prev_c, int prev_c1, int next_c) return c; } - // half-shape current and previous character - int shape_c = half_shape(prev_c); - int curr_c; - int curr_laa = A_firstc_laa(c, *c1p); - int prev_laa = A_firstc_laa(prev_c, prev_c1); + int curr_laa = arabic_combine(c, *c1p); + int prev_laa = arabic_combine(prev_c, prev_c1); if (curr_laa) { - if (A_is_valid(prev_c) && !A_is_f(shape_c) && !A_is_s(shape_c) - && !prev_laa) { - curr_c = chg_c_laa2f(curr_laa); + if (A_is_valid(prev_c) && can_join(prev_c, a_LAM) && !prev_laa) { + curr_c = chg_c_laa2f(*c1p); } else { - curr_c = chg_c_laa2i(curr_laa); + curr_c = chg_c_laa2i(*c1p); } - // Remove the composing character *c1p = 0; - } else if (!A_is_valid(prev_c) && A_is_valid(next_c)) { - curr_c = chg_c_a2i(c); - } else if (!shape_c || A_is_f(shape_c) || A_is_s(shape_c) || prev_laa) { - curr_c = A_is_valid(next_c) ? chg_c_a2i(c) : chg_c_a2s(c); - } else if (A_is_valid(next_c)) { -#if 0 - curr_c = A_is_iso(c) ? chg_c_a2m(c) : chg_c_i2m(c); -#else - curr_c = A_is_iso(c) ? chg_c_a2m(c) : 0; -#endif - } else if (A_is_valid(prev_c)) { - curr_c = chg_c_a2f(c); } else { - curr_c = chg_c_a2s(c); + struct achar *curr_a = find_achar(c); + int backward_combine = !prev_laa && can_join(prev_c, c); + int forward_combine = can_join(c, next_c); + + if (backward_combine && forward_combine) { + curr_c = (int)curr_a->medial; + } + if (backward_combine && !forward_combine) { + curr_c = (int)curr_a->final; + } + if (!backward_combine && forward_combine) { + curr_c = (int)curr_a->initial; + } + if (!backward_combine && !forward_combine) { + curr_c = (int)curr_a->isolated; + } } // Sanity check -- curr_c should, in the future, never be 0. @@ -966,88 +351,3 @@ int arabic_shape(int c, int *ccp, int *c1p, int prev_c, int prev_c1, int next_c) // Return the shaped character return curr_c; } - -/// Check whether we are dealing with Arabic combining characters. -/// Note: these are NOT really composing characters! -/// -/// @param one First character. -/// @param two Character just after "one". -bool arabic_combine(int one, int two) - FUNC_ATTR_PURE -{ - if (one == a_LAM) { - return arabic_maycombine(two); - } - return false; -} - -/// Check whether we are dealing with a character that could be regarded as an -/// Arabic combining character, need to check the character before this. -bool arabic_maycombine(int two) - FUNC_ATTR_PURE -{ - if (p_arshape && !p_tbidi) { - return two == a_ALEF_MADDA - || two == a_ALEF_HAMZA_ABOVE - || two == a_ALEF_HAMZA_BELOW - || two == a_ALEF; - } - return false; -} - -// A_firstc_laa returns first character of LAA combination if it exists -// in: "c" base character -// in: "c1" first composing character -static int A_firstc_laa(int c, int c1) -{ - if ((c1 != NUL) && (c == a_LAM) && !A_is_harakat(c1)) { - return c1; - } - return 0; -} - -// A_is_harakat returns true if 'c' is an Arabic Harakat character. -// (harakat/tanween) -static bool A_is_harakat(int c) -{ - return c >= a_FATHATAN && c <= a_SUKUN; -} - -// A_is_iso returns true if 'c' is an Arabic ISO-8859-6 character. -// (alphabet/number/punctuation) -static bool A_is_iso(int c) -{ - return ((c >= a_HAMZA && c <= a_GHAIN) - || (c >= a_TATWEEL && c <= a_HAMZA_BELOW) - || c == a_MINI_ALEF); -} - -// A_is_formb returns true if 'c' is an Arabic 10646-1 FormB character. -// (alphabet/number/punctuation) -static bool A_is_formb(int c) -{ - return ((c >= a_s_FATHATAN && c <= a_s_DAMMATAN) - || c == a_s_KASRATAN - || (c >= a_s_FATHA && c <= a_f_LAM_ALEF) - || c == a_BYTE_ORDER_MARK); -} - -// A_is_ok returns true if 'c' is an Arabic 10646 (8859-6 or Form-B). -static bool A_is_ok(int c) -{ - return A_is_iso(c) || A_is_formb(c); -} - -// A_is_valid returns true if 'c' is an Arabic 10646 (8859-6 or Form-B), -// with some exceptions/exclusions. -static bool A_is_valid(int c) -{ - return A_is_ok(c) && !A_is_special(c); -} - -// A_is_special returns true if 'c' is not a special Arabic character. -// Specials don't adhere to most of the rules. -static bool A_is_special(int c) -{ - return c == a_HAMZA || c == a_s_HAMZA; -} diff --git a/src/nvim/arabic.h b/src/nvim/arabic.h index eaab463777..3c34de1449 100644 --- a/src/nvim/arabic.h +++ b/src/nvim/arabic.h @@ -3,12 +3,7 @@ #include <stdbool.h> -/// Whether c belongs to the range of Arabic characters that might be shaped. -static inline bool arabic_char(int c) -{ - // return c >= a_HAMZA && c <= a_MINI_ALEF; - return c >= 0x0621 && c <= 0x0670; -} +#define ARABIC_CHAR(ch) (((ch) & 0xFF00) == 0x0600) #ifdef INCLUDE_GENERATED_DECLARATIONS # include "arabic.h.generated.h" diff --git a/src/nvim/ex_getln.c b/src/nvim/ex_getln.c index 7e22ed55cb..32977569c3 100644 --- a/src/nvim/ex_getln.c +++ b/src/nvim/ex_getln.c @@ -3224,7 +3224,7 @@ static void draw_cmdline(int start, int len) int u8cc[MAX_MCO]; int u8c = utfc_ptr2char_len(p, u8cc, start + len - i); mb_l = utfc_ptr2len_len(p, start + len - i); - if (arabic_char(u8c)) { + if (ARABIC_CHAR(u8c)) { do_arabicshape = true; break; } @@ -3260,7 +3260,7 @@ static void draw_cmdline(int start, int len) int u8cc[MAX_MCO]; int u8c = utfc_ptr2char_len(p, u8cc, start + len - i); mb_l = utfc_ptr2len_len(p, start + len - i); - if (arabic_char(u8c)) { + if (ARABIC_CHAR(u8c)) { int pc; int pc1 = 0; int nc = 0; diff --git a/src/nvim/grid.c b/src/nvim/grid.c index 7d407bd3d1..1268f987e1 100644 --- a/src/nvim/grid.c +++ b/src/nvim/grid.c @@ -241,7 +241,7 @@ void grid_puts_len(ScreenGrid *grid, char_u *text, int textlen, int row, int col u8c = utfc_ptr2char(ptr, u8cc); } mbyte_cells = utf_char2cells(u8c); - if (p_arshape && !p_tbidi && arabic_char(u8c)) { + if (p_arshape && !p_tbidi && ARABIC_CHAR(u8c)) { // Do Arabic shaping. if (len >= 0 && (int)(ptr - text) + mbyte_blen >= len) { // Past end of string to be displayed. diff --git a/src/nvim/screen.c b/src/nvim/screen.c index 32e2d515e1..03d7cb1783 100644 --- a/src/nvim/screen.c +++ b/src/nvim/screen.c @@ -1869,7 +1869,7 @@ static int line_putchar(buf_T *buf, LineState *s, schar_T *dest, int maxcells, b schar_from_ascii(dest[0], *p); s->prev_c = u8c; } else { - if (p_arshape && !p_tbidi && arabic_char(u8c)) { + if (p_arshape && !p_tbidi && ARABIC_CHAR(u8c)) { // Do Arabic shaping. int pc, pc1, nc; int pcc[MAX_MCO]; @@ -3157,7 +3157,7 @@ static int win_line(win_T *wp, linenr_T lnum, int startrow, int endrow, bool noc } } else if (mb_l == 0) { // at the NUL at end-of-line mb_l = 1; - } else if (p_arshape && !p_tbidi && arabic_char(mb_c)) { + } else if (p_arshape && !p_tbidi && ARABIC_CHAR(mb_c)) { // Do Arabic shaping. int pc, pc1, nc; int pcc[MAX_MCO]; |