aboutsummaryrefslogtreecommitdiff
path: root/utf8parse/src/table.rs.in
blob: 2acafe70fe395dda46d3943cbb0389705535837d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
//! UTF-8 Parse Transition Table

/// Transition table for parsing UTF-8. This is built from the grammar described
/// at <https://tools.ietf.org/html/rfc3629#section-4>, which is copied and
/// formatted below.
///
/// # UTF-8 Grammar
///
/// ```ignore
///     UTF8-octets = *( UTF8-char )
///     UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
///     UTF8-1      = %x00-7F
///     UTF8-2      = %xC2-DF UTF8-tail
///     UTF8-3      = %xE0    %xA0-BF UTF8-tail /
///                   %xE1-EC 2( UTF8-tail )    /
///                   %xED    %x80-9F UTF8-tail /
///                   %xEE-EF 2( UTF8-tail )
///     UTF8-4      = %xF0    %x90-BF 2( UTF8-tail ) /
///                   %xF1-F3 3( UTF8-tail )         /
///                   %xF4    %x80-8F 2( UTF8-tail )
///     UTF8-tail   = %x80-BF
/// ```
///
/// # Layout
///
/// The table holds 256 `u8` entries for each of the eight states listed in the
/// macro body below.
///
/// NOTE(review): how `utf8_state_table!` orders the rows and encodes each `u8`
/// entry is not visible in this file -- presumably each entry packs a
/// (next state, action) pair and is looked up by the incoming byte; confirm
/// against the macro definition.
///
/// # Defaults
///
/// Not specifying an action in this table is equivalent to specifying
/// `Action::InvalidSequence`. Not specifying a state is equivalent to specifying
/// `State::Ground`. A byte that has no entry at all for a given state therefore
/// gets both defaults; in `State::Ground` this covers 0x80-0xc1 and 0xf5-0xff,
/// none of which can start a `UTF8-char` in the grammar above.
pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! {
    // Only plain `//` comments are used inside the macro invocation so that no
    // extra tokens (doc attributes) are handed to `utf8_state_table!`.
    //
    // NOTE(review): the meaning of the individual `Action` variants is defined
    // elsewhere; the `SetByteN` names suggest "store the N-th byte counted from
    // the end of the sequence" -- confirm against the `Action` definition.

    // Ground: the incoming byte is the first octet of a UTF8-char.
    State::Ground => {
        // UTF8-1: a complete single-octet character.
        0x00...0x7f => (State::Ground,      Action::EmitByte),
        // UTF8-2 lead octet: exactly one tail octet follows.
        0xc2...0xdf => (State::Tail1,       Action::SetByte2Top),
        // UTF8-3 lead octets. 0xe0 and 0xed get dedicated states because the
        // grammar restricts their second octet (%xA0-BF and %x80-9F); the other
        // lead octets accept any two tail octets.
        0xe0        => (State::U3_2_e0,     Action::SetByte3Top),
        0xe1...0xec => (State::Tail2,       Action::SetByte3Top),
        0xed        => (State::U3_2_ed,     Action::SetByte3Top),
        0xee...0xef => (State::Tail2,       Action::SetByte3Top),
        // UTF8-4 lead octets. 0xf0 and 0xf4 get dedicated states because the
        // grammar restricts their second octet (%x90-BF and %x80-8F);
        // 0xf1-0xf3 accept any three tail octets.
        0xf0        => (State::Utf8_4_3_f0, Action::SetByte4),
        0xf1...0xf3 => (State::Tail3,       Action::SetByte4),
        0xf4        => (State::Utf8_4_3_f4, Action::SetByte4),
    },
    // Second octet of a UTF8-3 sequence that began with 0xe0: only %xA0-BF is
    // accepted, after which one tail octet remains.
    State::U3_2_e0 => {
        0xa0...0xbf => (State::Tail1, Action::SetByte2),
    },
    // Second octet of a UTF8-3 sequence that began with 0xed: only %x80-9F is
    // accepted, after which one tail octet remains.
    State::U3_2_ed => {
        0x80...0x9f => (State::Tail1, Action::SetByte2),
    },
    // Second octet of a UTF8-4 sequence that began with 0xf0: only %x90-BF is
    // accepted, after which two tail octets remain.
    State::Utf8_4_3_f0 => {
        0x90...0xbf => (State::Tail2, Action::SetByte3),
    },
    // Second octet of a UTF8-4 sequence that began with 0xf4: only %x80-8F is
    // accepted, after which two tail octets remain.
    State::Utf8_4_3_f4 => {
        0x80...0x8f => (State::Tail2, Action::SetByte3),
    },
    // Generic tail states: each consumes one UTF8-tail octet (%x80-BF) and
    // steps Tail3 -> Tail2 -> Tail1 -> Ground.
    State::Tail3 => {
        0x80...0xbf => (State::Tail2, Action::SetByte3),
    },
    State::Tail2 => {
        0x80...0xbf => (State::Tail1, Action::SetByte2),
    },
    // Last octet of a multi-octet character; the parser returns to Ground.
    State::Tail1 => {
        0x80...0xbf => (State::Ground, Action::SetByte1),
    },
};