//! Types supporting the UTF-8 parser

/// Action to take when receiving a byte
///
/// Every call to [`State::advance`] yields exactly one of these alongside the
/// next state.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom continuation byte (the final byte of a multi-byte sequence)
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte
    SetByte2 = 3,
    /// Set the 2nd-from-last byte which is the lead byte of a two byte sequence
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte
    SetByte3 = 5,
    /// Set the 3rd-from-last byte which is the lead byte of a three byte sequence
    SetByte3Top = 6,
    /// Set the top byte of a four byte sequence.
    SetByte4 = 7,
}

/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
// The variant names embed the lead byte in lowercase hex (`e0`, `ed`, ...),
// which trips the camel-case lint.
#[allow(non_camel_case_types)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything
    Ground = 0,
    /// 3 tail bytes
    Tail3 = 1,
    /// 2 tail bytes
    Tail2 = 2,
    /// 1 tail byte
    Tail1 = 3,
    /// UTF8-3 starting with E0
    U3_2_e0 = 4,
    /// UTF8-3 starting with ED
    U3_2_ed = 5,
    /// UTF8-4 starting with F0
    Utf8_4_3_f0 = 6,
    /// UTF8-4 starting with F4
    Utf8_4_3_f4 = 7,
}

impl Default for State {
    /// The parser starts out in [`State::Ground`].
    fn default() -> State {
        State::Ground
    }
}

impl State {
    /// Advance the parser state.
    ///
    /// This takes the current state and input byte into consideration, to determine the next state
    /// and any action that should be taken.
    ///
    /// Whenever `byte` is not acceptable in the current state the result is
    /// `(State::Ground, Action::InvalidSequence)`. This function does not look
    /// at the rejected byte again; what to do with it is up to the caller.
    #[inline]
    pub fn advance(&self, byte: u8) -> (State, Action) {
        match *self {
            State::Ground => match byte {
                0x00..=0x7f => (State::Ground, Action::EmitByte),
                // 0xc0 and 0xc1 are deliberately absent: they could only
                // start an overlong encoding of an ASCII value.
                0xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
                // E0 and ED restrict their second byte, so each gets its own state.
                0xe0 => (State::U3_2_e0, Action::SetByte3Top),
                0xed => (State::U3_2_ed, Action::SetByte3Top),
                0xe1..=0xec | 0xee..=0xef => (State::Tail2, Action::SetByte3Top),
                // F0 and F4 restrict their second byte, so each gets its own state.
                0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
                0xf1..=0xf3 => (State::Tail3, Action::SetByte4),
                0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
                // Stray continuation bytes (0x80..=0xbf), 0xc0/0xc1 and 0xf5..=0xff.
                _ => (State::Ground, Action::InvalidSequence),
            },
            // After E0 the second byte must be >= 0xa0; anything lower would
            // be an overlong encoding of a value below U+0800.
            State::U3_2_e0 => match byte {
                0xa0..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            // After ED the second byte must be <= 0x9f; anything higher would
            // encode a surrogate (U+D800..=U+DFFF).
            State::U3_2_ed => match byte {
                0x80..=0x9f => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            // After F0 the second byte must be >= 0x90; anything lower would
            // be an overlong encoding of a value below U+10000.
            State::Utf8_4_3_f0 => match byte {
                0x90..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            // After F4 the second byte must be <= 0x8f; anything higher would
            // encode a value above U+10FFFF.
            State::Utf8_4_3_f4 => match byte {
                0x80..=0x8f => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            // Plain tail states: any continuation byte moves one step closer to Ground.
            State::Tail3 => match byte {
                0x80..=0xbf => (State::Tail2, Action::SetByte3),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail2 => match byte {
                0x80..=0xbf => (State::Tail1, Action::SetByte2),
                _ => (State::Ground, Action::InvalidSequence),
            },
            State::Tail1 => match byte {
                0x80..=0xbf => (State::Ground, Action::SetByte1),
                _ => (State::Ground, Action::InvalidSequence),
            },
        }
    }
}