diff options
author | Joe Wilm <joe@jwilm.com> | 2016-09-17 15:51:45 -0700 |
---|---|---|
committer | Joe Wilm <joe@jwilm.com> | 2016-09-17 17:03:20 -0700 |
commit | cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch) | |
tree | 26603abf607d21eefd3b9a6ac79a36dab63b5781 /src | |
parent | 930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff) | |
download | r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2 r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip |
Add support for UTF-8
This adds a table-driven UTF-8 parser that has only a single branch in
the entire parser. UTF-8 support is essentially bolted onto the VTE
parser. This is not the most elegant approach, but it does prevent the
transition tables from blowing up.
Instead of refactoring the syntax extension to handle both table
definitions, I've opted to copy/paste for now, both for simplicity's sake and
because I can't see a clear path to a minimal shared solution.
Diffstat (limited to 'src')
-rw-r--r-- | src/definitions.rs | 8 | ||||
-rw-r--r-- | src/lib.rs | 49 | ||||
-rw-r--r-- | src/table.rs | 19 | ||||
-rw-r--r-- | src/table.rs.in | 14 | ||||
-rw-r--r-- | src/utf8/mod.rs | 91 | ||||
-rw-r--r-- | src/utf8/table.rs | 184 | ||||
-rw-r--r-- | src/utf8/table.rs.in | 60 | ||||
-rw-r--r-- | src/utf8/types.rs | 77 |
8 files changed, 480 insertions, 22 deletions
diff --git a/src/definitions.rs b/src/definitions.rs index 5177ca6..ded49cf 100644 --- a/src/definitions.rs +++ b/src/definitions.rs @@ -15,7 +15,7 @@ pub enum State { Ground = 12, OscString = 13, SosPmApcString = 14, - Unused__ = 15, + Utf8 = 15, } #[derive(Debug, Clone, Copy)] @@ -35,7 +35,7 @@ pub enum Action { Print = 12, Put = 13, Unhook = 14, - Unused__ = 15, + BeginUtf8 = 15, } /// Unpack a u8 into a State and Action @@ -67,12 +67,12 @@ mod tests { } match unpack(0x0f) { - (State::Unused__, Action::None) => (), + (State::Utf8, Action::None) => (), _ => panic!("unpack failed"), } match unpack(0xff) { - (State::Unused__, Action::Unused__) => (), + (State::Utf8, Action::BeginUtf8) => (), _ => panic!("unpack failed"), } } @@ -1,5 +1,6 @@ mod table; mod definitions; +mod utf8; pub use definitions::{Action, State, unpack}; @@ -27,6 +28,20 @@ impl State { const MAX_INTERMEDIATES: usize = 2; const MAX_PARAMS: usize = 16; +struct VtUtf8Receiver<'a, P: Parser + 'a>(&'a mut P, &'a mut State); + +impl<'a, P: Parser> utf8::Receiver for VtUtf8Receiver<'a, P> { + fn codepoint(&mut self, c: char) { + self.0.print(c); + *self.1 = State::Ground; + } + + fn invalid_sequence(&mut self) { + self.0.print('�'); + *self.1 = State::Ground; + } +} + /// ANSI VTE Parser /// /// As described in http://vt100.net/emu/dec_ansi_parser @@ -38,7 +53,8 @@ pub struct StateMachine { intermediate_idx: usize, params: [i64; MAX_PARAMS], num_params: usize, - ignoring: bool + ignoring: bool, + utf8_parser: utf8::Parser, } impl StateMachine { @@ -50,6 +66,7 @@ impl StateMachine { params: [0i64; MAX_PARAMS], num_params: 0, ignoring: false, + utf8_parser: utf8::Parser::new(), } } @@ -62,6 +79,12 @@ impl StateMachine { } pub fn advance<P: Parser>(&mut self, parser: &mut P, byte: u8) { + // Utf8 characters are handled out-of-band. 
+ if let State::Utf8 = self.state { + self.process_utf8(parser, byte); + return; + } + // Handle state changes in the anywhere state before evaluating changes // for current state. let mut change = STATE_CHANGE[State::Anywhere as usize][byte as usize]; @@ -76,13 +99,22 @@ impl StateMachine { self.perform_state_change(parser, state, action, byte); } + #[inline] + fn process_utf8<P>(&mut self, parser: &mut P, byte: u8) + where P: Parser + { + let mut receiver = VtUtf8Receiver(parser, &mut self.state); + let utf8_parser = &mut self.utf8_parser; + utf8_parser.advance(&mut receiver, byte); + } + fn perform_state_change<P>(&mut self, parser: &mut P, state: State, action: Action, byte: u8) where P: Parser { macro_rules! maybe_action { ($action:expr, $arg:expr) => { match $action { - Action::None | Action::Unused__ => (), + Action::None => (), action => { self.perform_action(parser, action, $arg); }, @@ -91,7 +123,7 @@ impl StateMachine { } match state { - State::Anywhere | State::Unused__ => { + State::Anywhere => { // Just run the action self.perform_action(parser, action, byte); }, @@ -114,7 +146,7 @@ impl StateMachine { fn perform_action<P: Parser>(&mut self, parser: &mut P, action: Action, byte: u8) { match action { - Action::Print => parser.print(self, byte as char), + Action::Print => parser.print(byte as char), Action::Execute => parser.execute(self, byte), Action::Hook => parser.hook(self, byte), Action::Put => parser.put(self, byte), @@ -124,7 +156,7 @@ impl StateMachine { Action::Unhook => parser.unhook(self, byte), Action::CsiDispatch => parser.csi_dispatch(self, byte as char), Action::EscDispatch => parser.esc_dispatch(self, byte), - Action::Ignore | Action::None | Action::Unused__=> (), + Action::Ignore | Action::None => (), Action::Collect => { if self.intermediate_idx == MAX_INTERMEDIATES { self.ignoring = true; @@ -155,13 +187,16 @@ impl StateMachine { self.intermediate_idx = 0; self.num_params = 0; self.ignoring = false; - } + }, + Action::BeginUtf8 => { 
+ self.process_utf8(parser, byte); + }, } } } pub trait Parser { - fn print(&mut self, &StateMachine, c: char); + fn print(&mut self, c: char); fn execute(&mut self, &StateMachine, byte: u8); fn hook(&mut self, &StateMachine, byte: u8); fn put(&mut self, &StateMachine, byte: u8); diff --git a/src/table.rs b/src/table.rs index 923c7eb..d2034b8 100644 --- a/src/table.rs +++ b/src/table.rs @@ -6,6 +6,9 @@ use definitions::Action; pub static STATE_CHANGE: [[u8; 256]; 16] = [ + // Beginning of UTF-8 2 byte sequence + // Beginning of UTF-8 3 byte sequence + // Beginning of UTF-8 4 byte sequence @@ -280,11 +283,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = 80u8, 80u8, 80u8, 80u8, 80u8, 80u8, 0u8, 80u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, - 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + 0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, + 255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8], [112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 0u8, 112u8, 0u8, 0u8, 112u8, 112u8, 112u8, @@ -366,7 +371,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = Action::OscStart, // State::OscString Action::None, // 
State::SosPmApcString Action::None]; - // State::Unused__ + // State::Utf8 pub static EXIT_ACTIONS: &'static [Action] = &[Action::None, // State::Anywhere @@ -384,4 +389,4 @@ pub static EXIT_ACTIONS: &'static [Action] = Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None]; // State::Unused__ + Action::None]; // State::Utf8 diff --git a/src/table.rs.in b/src/table.rs.in index 7414c2a..f5a838d 100644 --- a/src/table.rs.in +++ b/src/table.rs.in @@ -3,7 +3,7 @@ use definitions::Action; -pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { +pub static STATE_CHANGE: [[u8; 256]; 16] = vt_state_table! { State::Anywhere => { 0x18 => (Action::Execute, State::Ground), 0x1a => (Action::Execute, State::Ground), @@ -28,7 +28,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! { 0x20...0x7f => Action::Print, 0x80...0x8f => Action::Execute, 0x91...0x9a => Action::Execute, - 0x9c => Action::Execute + 0x9c => Action::Execute, + // Beginning of UTF-8 2 byte sequence + 0xc2...0xdf => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 3 byte sequence + 0xe0...0xef => (State::Utf8, Action::BeginUtf8), + // Beginning of UTF-8 4 byte sequence + 0xf0...0xf4 => (State::Utf8, Action::BeginUtf8), }, State::Escape => { @@ -191,7 +197,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscStart, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; pub static EXIT_ACTIONS: &'static [Action] = &[ @@ -210,5 +216,5 @@ pub static EXIT_ACTIONS: &'static [Action] = &[ Action::None, // State::Ground Action::OscEnd, // State::OscString Action::None, // State::SosPmApcString - Action::None, // State::Unused__ + Action::None, // State::Utf8 ]; diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs new file mode 100644 index 0000000..3d099b1 --- /dev/null +++ b/src/utf8/mod.rs @@ -0,0 +1,91 @@ +//! 
A table-driven UTF-8 Parser +//! +//! This module implements a table-driven UTF-8 parser which should +//! theoretically contain the minimal number of branches (1). The only branch is +//! on the `Action` returned from unpacking a transition. +use std::char; + +mod types; +use self::types::{State, Action, unpack}; + +mod table; +use self::table::TRANSITIONS; + +/// Handles codepoint and invalid sequence events from the parser. +pub trait Receiver { + /// Code point parsed + /// + /// Called with the codepoint + fn codepoint(&mut self, char); + + /// Invalid sequence encountered + fn invalid_sequence(&mut self); +} + +/// A parser for Utf8 Characters +/// +/// Repeatedly call `advance` with bytes to emit Utf8 characters +pub struct Parser { + point: u32, + state: State, +} + +/// Continuation bytes are masked with this value. +const CONTINUATION_MASK: u8 = 0b0011_1111; + +impl Parser { + /// Create a new Parser + pub fn new() -> Parser { + Parser { + point: 0, + state: State::Ground, + } + } + + pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) + where R: Receiver + { + let cur = self.state as usize; + let change = TRANSITIONS[cur][byte as usize]; + let (state, action) = unsafe { unpack(change) }; + + self.perform_action(receiver, byte, action); + self.state = state; + } + + fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) + where R: Receiver + { + match action { + Action::InvalidSequence => { + self.point = 0; + receiver.invalid_sequence(); + }, + Action::EmitByte => { + receiver.codepoint(byte as char); + }, + Action::SetByte1 => { + let point = self.point | ((byte & CONTINUATION_MASK) as u32); + let c = unsafe { char::from_u32_unchecked(point) }; + self.point = 0; + + receiver.codepoint(c); + }, + Action::SetByte2 => { + self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; + }, + Action::SetByte2Top => { + self.point |= ((byte & 0b0001_1111) as u32) << 6; + }, + Action::SetByte3 => { + self.point |= ((byte & CONTINUATION_MASK) 
as u32) << 12; + }, + Action::SetByte3Top => { + self.point |= ((byte & 0b0000_1111) as u32) << 12; + }, + Action::SetByte4 => { + self.point |= ((byte & 0b0000_0111) as u32) << 18; + }, + } + } +} diff --git a/src/utf8/table.rs b/src/utf8/table.rs new file mode 100644 index 0000000..5a1292b --- /dev/null +++ b/src/utf8/table.rs @@ -0,0 +1,184 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. +/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// state::ground. 
+pub static TRANSITIONS: [[u8; 256]; 8] = + [[16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, + 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, + 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 100u8, 98u8, + 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 101u8, + 98u8, 98u8, 118u8, 113u8, 113u8, 113u8, 119u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, + 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 
0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, + 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 
0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8], + [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 
0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, + 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, + 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]]; diff --git a/src/utf8/table.rs.in b/src/utf8/table.rs.in new file mode 100644 index 0000000..2acafe7 --- /dev/null +++ b/src/utf8/table.rs.in @@ -0,0 +1,60 @@ +//! UTF-8 Parse Transition Table + +/// Transition table for parsing UTF-8. This is built from the grammar described +/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and +/// formatted below. 
+/// +/// # UTF-8 Grammar +/// +/// ```ignore +/// UTF8-octets = *( UTF8-char ) +/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 +/// UTF8-1 = %x00-7F +/// UTF8-2 = %xC2-DF UTF8-tail +/// UTF8-3 = %xE0 %xA0-BF UTF8-tail / +/// %xE1-EC 2( UTF8-tail ) / +/// %xED %x80-9F UTF8-tail / +/// %xEE-EF 2( UTF8-tail ) +/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / +/// %xF1-F3 3( UTF8-tail ) / +/// %xF4 %x80-8F 2( UTF8-tail ) +/// UTF8-tail = %x80-BF +/// ``` +/// +/// Not specifying an action in this table is equivalent to specifying +/// Action::InvalidSequence. Not specifying a state is equivalent to specifying +/// state::ground. +pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! { + State::Ground => { + 0x00...0x7f => (State::Ground, Action::EmitByte), + 0xc2...0xdf => (State::Tail1, Action::SetByte2Top), + 0xe0 => (State::U3_2_e0, Action::SetByte3Top), + 0xe1...0xec => (State::Tail2, Action::SetByte3Top), + 0xed => (State::U3_2_ed, Action::SetByte3Top), + 0xee...0xef => (State::Tail2, Action::SetByte3Top), + 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), + 0xf1...0xf3 => (State::Tail3, Action::SetByte4), + 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), + }, + State::U3_2_e0 => { + 0xa0...0xbf => (State::Tail1, Action::SetByte2), + }, + State::U3_2_ed => { + 0x80...0x9f => (State::Tail1, Action::SetByte2), + }, + State::Utf8_4_3_f0 => { + 0x90...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Utf8_4_3_f4 => { + 0x80...0x8f => (State::Tail2, Action::SetByte3), + }, + State::Tail3 => { + 0x80...0xbf => (State::Tail2, Action::SetByte3), + }, + State::Tail2 => { + 0x80...0xbf => (State::Tail1, Action::SetByte2), + }, + State::Tail1 => { + 0x80...0xbf => (State::Ground, Action::SetByte1), + }, +}; diff --git a/src/utf8/types.rs b/src/utf8/types.rs new file mode 100644 index 0000000..4c604f4 --- /dev/null +++ b/src/utf8/types.rs @@ -0,0 +1,77 @@ +//! 
Types supporting the UTF-8 parser +#![allow(non_camel_case_types)] +use std::mem; + +/// States the parser can be in. +/// +/// There is a state for each initial input of the 3 and 4 byte sequences since +/// the following bytes are subject to different conditions than a tail byte. +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum State { + /// Ground state; expect anything + Ground = 0, + /// 3 tail bytes + Tail3 = 1, + /// 2 tail bytes + Tail2 = 2, + /// 1 tail byte + Tail1 = 3, + /// UTF8-3 starting with E0 + U3_2_e0 = 4, + /// UTF8-3 starting with ED + U3_2_ed = 5, + /// UTF8-4 starting with F0 + Utf8_4_3_f0 = 6, + /// UTF8-4 starting with F4 + Utf8_4_3_f4 = 7, +} + +/// Action to take when receiving a byte +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum Action { + /// Unexpected byte; sequence is invalid + InvalidSequence = 0, + /// Received valid 7-bit ASCII byte which can be directly emitted. + EmitByte = 1, + /// Set the bottom continuation byte + SetByte1 = 2, + /// Set the 2nd-from-last continuation byte + SetByte2 = 3, + /// Set the 2nd-from-last byte which is part of a two byte sequence + SetByte2Top = 4, + /// Set the 3rd-from-last continuation byte + SetByte3 = 5, + /// Set the 3rd-from-last byte which is part of a three byte sequence + SetByte3Top = 6, + /// Set the top byte of a four byte sequence. + SetByte4 = 7, +} + +/// Convert a state and action to a u8 +/// +/// State will be the bottom 4 bits and action the top 4 +#[inline] +#[allow(dead_code)] +pub fn pack(state: State, action: Action) -> u8 { + ((action as u8) << 4) | (state as u8) +} + +/// Convert a u8 to a state and action +/// +/// # Unsafety +/// +/// If this function is called with a byte that wasn't encoded with the `pack` +/// function in this module, there is no guarantee that a valid state and action +/// can be produced. 
+#[inline] +pub unsafe fn unpack(val: u8) -> (State, Action) { + ( + // State is stored in bottom 4 bits + mem::transmute(val & 0x0f), + + // Action is stored in top 4 bits + mem::transmute(val >> 4), + ) +} |