about | summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/definitions.rs 8
-rw-r--r-- src/lib.rs 49
-rw-r--r-- src/table.rs 19
-rw-r--r-- src/table.rs.in 14
-rw-r--r-- src/utf8/mod.rs 91
-rw-r--r-- src/utf8/table.rs 184
-rw-r--r-- src/utf8/table.rs.in 60
-rw-r--r-- src/utf8/types.rs 77
8 files changed, 480 insertions, 22 deletions
diff --git a/src/definitions.rs b/src/definitions.rs
index 5177ca6..ded49cf 100644
--- a/src/definitions.rs
+++ b/src/definitions.rs
@@ -15,7 +15,7 @@ pub enum State {
Ground = 12,
OscString = 13,
SosPmApcString = 14,
- Unused__ = 15,
+ Utf8 = 15,
}
#[derive(Debug, Clone, Copy)]
@@ -35,7 +35,7 @@ pub enum Action {
Print = 12,
Put = 13,
Unhook = 14,
- Unused__ = 15,
+ BeginUtf8 = 15,
}
/// Unpack a u8 into a State and Action
@@ -67,12 +67,12 @@ mod tests {
}
match unpack(0x0f) {
- (State::Unused__, Action::None) => (),
+ (State::Utf8, Action::None) => (),
_ => panic!("unpack failed"),
}
match unpack(0xff) {
- (State::Unused__, Action::Unused__) => (),
+ (State::Utf8, Action::BeginUtf8) => (),
_ => panic!("unpack failed"),
}
}
diff --git a/src/lib.rs b/src/lib.rs
index 73326bc..17e265e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,6 @@
mod table;
mod definitions;
+mod utf8;
pub use definitions::{Action, State, unpack};
@@ -27,6 +28,20 @@ impl State {
const MAX_INTERMEDIATES: usize = 2;
const MAX_PARAMS: usize = 16;
+struct VtUtf8Receiver<'a, P: Parser + 'a>(&'a mut P, &'a mut State);
+
+impl<'a, P: Parser> utf8::Receiver for VtUtf8Receiver<'a, P> {
+ fn codepoint(&mut self, c: char) {
+ self.0.print(c);
+ *self.1 = State::Ground;
+ }
+
+ fn invalid_sequence(&mut self) {
+ self.0.print('�');
+ *self.1 = State::Ground;
+ }
+}
+
/// ANSI VTE Parser
///
/// As described in http://vt100.net/emu/dec_ansi_parser
@@ -38,7 +53,8 @@ pub struct StateMachine {
intermediate_idx: usize,
params: [i64; MAX_PARAMS],
num_params: usize,
- ignoring: bool
+ ignoring: bool,
+ utf8_parser: utf8::Parser,
}
impl StateMachine {
@@ -50,6 +66,7 @@ impl StateMachine {
params: [0i64; MAX_PARAMS],
num_params: 0,
ignoring: false,
+ utf8_parser: utf8::Parser::new(),
}
}
@@ -62,6 +79,12 @@ impl StateMachine {
}
pub fn advance<P: Parser>(&mut self, parser: &mut P, byte: u8) {
+ // Utf8 characters are handled out-of-band.
+ if let State::Utf8 = self.state {
+ self.process_utf8(parser, byte);
+ return;
+ }
+
// Handle state changes in the anywhere state before evaluating changes
// for current state.
let mut change = STATE_CHANGE[State::Anywhere as usize][byte as usize];
@@ -76,13 +99,22 @@ impl StateMachine {
self.perform_state_change(parser, state, action, byte);
}
+ #[inline]
+ fn process_utf8<P>(&mut self, parser: &mut P, byte: u8)
+ where P: Parser
+ {
+ let mut receiver = VtUtf8Receiver(parser, &mut self.state);
+ let utf8_parser = &mut self.utf8_parser;
+ utf8_parser.advance(&mut receiver, byte);
+ }
+
fn perform_state_change<P>(&mut self, parser: &mut P, state: State, action: Action, byte: u8)
where P: Parser
{
macro_rules! maybe_action {
($action:expr, $arg:expr) => {
match $action {
- Action::None | Action::Unused__ => (),
+ Action::None => (),
action => {
self.perform_action(parser, action, $arg);
},
@@ -91,7 +123,7 @@ impl StateMachine {
}
match state {
- State::Anywhere | State::Unused__ => {
+ State::Anywhere => {
// Just run the action
self.perform_action(parser, action, byte);
},
@@ -114,7 +146,7 @@ impl StateMachine {
fn perform_action<P: Parser>(&mut self, parser: &mut P, action: Action, byte: u8) {
match action {
- Action::Print => parser.print(self, byte as char),
+ Action::Print => parser.print(byte as char),
Action::Execute => parser.execute(self, byte),
Action::Hook => parser.hook(self, byte),
Action::Put => parser.put(self, byte),
@@ -124,7 +156,7 @@ impl StateMachine {
Action::Unhook => parser.unhook(self, byte),
Action::CsiDispatch => parser.csi_dispatch(self, byte as char),
Action::EscDispatch => parser.esc_dispatch(self, byte),
- Action::Ignore | Action::None | Action::Unused__=> (),
+ Action::Ignore | Action::None => (),
Action::Collect => {
if self.intermediate_idx == MAX_INTERMEDIATES {
self.ignoring = true;
@@ -155,13 +187,16 @@ impl StateMachine {
self.intermediate_idx = 0;
self.num_params = 0;
self.ignoring = false;
- }
+ },
+ Action::BeginUtf8 => {
+ self.process_utf8(parser, byte);
+ },
}
}
}
pub trait Parser {
- fn print(&mut self, &StateMachine, c: char);
+ fn print(&mut self, c: char);
fn execute(&mut self, &StateMachine, byte: u8);
fn hook(&mut self, &StateMachine, byte: u8);
fn put(&mut self, &StateMachine, byte: u8);
diff --git a/src/table.rs b/src/table.rs
index 923c7eb..d2034b8 100644
--- a/src/table.rs
+++ b/src/table.rs
@@ -6,6 +6,9 @@ use definitions::Action;
pub static STATE_CHANGE: [[u8; 256]; 16] =
[
+ // Beginning of UTF-8 2 byte sequence
+ // Beginning of UTF-8 3 byte sequence
+ // Beginning of UTF-8 4 byte sequence
@@ -280,11 +283,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] =
80u8, 80u8, 80u8, 80u8, 80u8, 80u8, 0u8, 80u8, 0u8, 0u8, 0u8, 0u8, 0u8,
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
- 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ 0u8, 0u8, 0u8, 0u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8, 255u8,
+ 255u8, 255u8, 255u8, 255u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8],
[112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8,
112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8, 112u8,
112u8, 112u8, 112u8, 112u8, 0u8, 112u8, 0u8, 0u8, 112u8, 112u8, 112u8,
@@ -366,7 +371,7 @@ pub static ENTRY_ACTIONS: &'static [Action] =
Action::OscStart, // State::OscString
Action::None, // State::SosPmApcString
Action::None];
- // State::Unused__
+ // State::Utf8
pub static EXIT_ACTIONS: &'static [Action] =
&[Action::None, // State::Anywhere
@@ -384,4 +389,4 @@ pub static EXIT_ACTIONS: &'static [Action] =
Action::None, // State::Ground
Action::OscEnd, // State::OscString
Action::None, // State::SosPmApcString
- Action::None]; // State::Unused__
+ Action::None]; // State::Utf8
diff --git a/src/table.rs.in b/src/table.rs.in
index 7414c2a..f5a838d 100644
--- a/src/table.rs.in
+++ b/src/table.rs.in
@@ -3,7 +3,7 @@
use definitions::Action;
-pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! {
+pub static STATE_CHANGE: [[u8; 256]; 16] = vt_state_table! {
State::Anywhere => {
0x18 => (Action::Execute, State::Ground),
0x1a => (Action::Execute, State::Ground),
@@ -28,7 +28,13 @@ pub static STATE_CHANGE: [[u8; 256]; 16] = state_table! {
0x20...0x7f => Action::Print,
0x80...0x8f => Action::Execute,
0x91...0x9a => Action::Execute,
- 0x9c => Action::Execute
+ 0x9c => Action::Execute,
+ // Beginning of UTF-8 2 byte sequence
+ 0xc2...0xdf => (State::Utf8, Action::BeginUtf8),
+ // Beginning of UTF-8 3 byte sequence
+ 0xe0...0xef => (State::Utf8, Action::BeginUtf8),
+ // Beginning of UTF-8 4 byte sequence
+ 0xf0...0xf4 => (State::Utf8, Action::BeginUtf8),
},
State::Escape => {
@@ -191,7 +197,7 @@ pub static ENTRY_ACTIONS: &'static [Action] = &[
Action::None, // State::Ground
Action::OscStart, // State::OscString
Action::None, // State::SosPmApcString
- Action::None, // State::Unused__
+ Action::None, // State::Utf8
];
pub static EXIT_ACTIONS: &'static [Action] = &[
@@ -210,5 +216,5 @@ pub static EXIT_ACTIONS: &'static [Action] = &[
Action::None, // State::Ground
Action::OscEnd, // State::OscString
Action::None, // State::SosPmApcString
- Action::None, // State::Unused__
+ Action::None, // State::Utf8
];
diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs
new file mode 100644
index 0000000..3d099b1
--- /dev/null
+++ b/src/utf8/mod.rs
@@ -0,0 +1,91 @@
+//! A table-driven UTF-8 Parser
+//!
+//! This module implements a table-driven UTF-8 parser which should
+//! theoretically contain the minimal number of branches (1). The only branch is
+//! on the `Action` returned from unpacking a transition.
+use std::char;
+
+mod types;
+use self::types::{State, Action, unpack};
+
+mod table;
+use self::table::TRANSITIONS;
+
+/// Handles codepoint and invalid sequence events from the parser.
+pub trait Receiver {
+ /// Code point parsed
+ ///
+ /// Called with the codepoint
+ fn codepoint(&mut self, char);
+
+ /// Invalid sequence encountered
+ fn invalid_sequence(&mut self);
+}
+
+/// A parser for UTF-8 characters
+///
+/// Repeatedly call `advance` with input bytes; results are reported to the `Receiver`
+pub struct Parser {
+ point: u32,
+ state: State,
+}
+
+/// Continuation bytes are masked with this value.
+const CONTINUATION_MASK: u8 = 0b0011_1111;
+
+impl Parser {
+ /// Create a new Parser
+ pub fn new() -> Parser {
+ Parser {
+ point: 0,
+ state: State::Ground,
+ }
+ }
+
+ pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
+ where R: Receiver
+ {
+ let cur = self.state as usize;
+ let change = TRANSITIONS[cur][byte as usize];
+ let (state, action) = unsafe { unpack(change) };
+
+ self.perform_action(receiver, byte, action);
+ self.state = state;
+ }
+
+ fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
+ where R: Receiver
+ {
+ match action {
+ Action::InvalidSequence => {
+ self.point = 0;
+ receiver.invalid_sequence();
+ },
+ Action::EmitByte => {
+ receiver.codepoint(byte as char);
+ },
+ Action::SetByte1 => {
+ let point = self.point | ((byte & CONTINUATION_MASK) as u32);
+ let c = unsafe { char::from_u32_unchecked(point) };
+ self.point = 0;
+
+ receiver.codepoint(c);
+ },
+ Action::SetByte2 => {
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
+ },
+ Action::SetByte2Top => {
+ self.point |= ((byte & 0b0001_1111) as u32) << 6;
+ },
+ Action::SetByte3 => {
+ self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
+ },
+ Action::SetByte3Top => {
+ self.point |= ((byte & 0b0000_1111) as u32) << 12;
+ },
+ Action::SetByte4 => {
+ self.point |= ((byte & 0b0000_0111) as u32) << 18;
+ },
+ }
+ }
+}
diff --git a/src/utf8/table.rs b/src/utf8/table.rs
new file mode 100644
index 0000000..5a1292b
--- /dev/null
+++ b/src/utf8/table.rs
@@ -0,0 +1,184 @@
+//! UTF-8 Parse Transition Table
+
+/// Transition table for parsing UTF-8. This is built from the grammar described
+/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and
+/// formatted below.
+///
+/// # UTF-8 Grammar
+///
+/// ```ignore
+/// UTF8-octets = *( UTF8-char )
+/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+/// UTF8-1 = %x00-7F
+/// UTF8-2 = %xC2-DF UTF8-tail
+/// UTF8-3 = %xE0 %xA0-BF UTF8-tail /
+/// %xE1-EC 2( UTF8-tail ) /
+/// %xED %x80-9F UTF8-tail /
+/// %xEE-EF 2( UTF8-tail )
+/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
+/// %xF1-F3 3( UTF8-tail ) /
+/// %xF4 %x80-8F 2( UTF8-tail )
+/// UTF8-tail = %x80-BF
+/// ```
+///
+/// Not specifying an action in this table is equivalent to specifying
+/// `Action::InvalidSequence`. Not specifying a state is equivalent to
+/// specifying `State::Ground`.
+pub static TRANSITIONS: [[u8; 256]; 8] =
+ [[16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8,
+ 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 16u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8,
+ 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8,
+ 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 67u8, 100u8, 98u8,
+ 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 98u8, 101u8,
+ 98u8, 98u8, 118u8, 113u8, 113u8, 113u8, 119u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 32u8,
+ 32u8, 32u8, 32u8, 32u8, 32u8, 32u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8,
+ 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 51u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8],
+ [0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 82u8,
+ 82u8, 82u8, 82u8, 82u8, 82u8, 82u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8,
+ 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8]];
diff --git a/src/utf8/table.rs.in b/src/utf8/table.rs.in
new file mode 100644
index 0000000..2acafe7
--- /dev/null
+++ b/src/utf8/table.rs.in
@@ -0,0 +1,60 @@
+//! UTF-8 Parse Transition Table
+
+/// Transition table for parsing UTF-8. This is built from the grammar described
+/// at https://tools.ietf.org/html/rfc3629#section-4 which I have copied and
+/// formatted below.
+///
+/// # UTF-8 Grammar
+///
+/// ```ignore
+/// UTF8-octets = *( UTF8-char )
+/// UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+/// UTF8-1 = %x00-7F
+/// UTF8-2 = %xC2-DF UTF8-tail
+/// UTF8-3 = %xE0 %xA0-BF UTF8-tail /
+/// %xE1-EC 2( UTF8-tail ) /
+/// %xED %x80-9F UTF8-tail /
+/// %xEE-EF 2( UTF8-tail )
+/// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) /
+/// %xF1-F3 3( UTF8-tail ) /
+/// %xF4 %x80-8F 2( UTF8-tail )
+/// UTF8-tail = %x80-BF
+/// ```
+///
+/// Not specifying an action in this table is equivalent to specifying
+/// `Action::InvalidSequence`. Not specifying a state is equivalent to
+/// specifying `State::Ground`.
+pub static TRANSITIONS: [[u8; 256]; 8] = utf8_state_table! {
+ State::Ground => {
+ 0x00...0x7f => (State::Ground, Action::EmitByte),
+ 0xc2...0xdf => (State::Tail1, Action::SetByte2Top),
+ 0xe0 => (State::U3_2_e0, Action::SetByte3Top),
+ 0xe1...0xec => (State::Tail2, Action::SetByte3Top),
+ 0xed => (State::U3_2_ed, Action::SetByte3Top),
+ 0xee...0xef => (State::Tail2, Action::SetByte3Top),
+ 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
+ 0xf1...0xf3 => (State::Tail3, Action::SetByte4),
+ 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
+ },
+ State::U3_2_e0 => {
+ 0xa0...0xbf => (State::Tail1, Action::SetByte2),
+ },
+ State::U3_2_ed => {
+ 0x80...0x9f => (State::Tail1, Action::SetByte2),
+ },
+ State::Utf8_4_3_f0 => {
+ 0x90...0xbf => (State::Tail2, Action::SetByte3),
+ },
+ State::Utf8_4_3_f4 => {
+ 0x80...0x8f => (State::Tail2, Action::SetByte3),
+ },
+ State::Tail3 => {
+ 0x80...0xbf => (State::Tail2, Action::SetByte3),
+ },
+ State::Tail2 => {
+ 0x80...0xbf => (State::Tail1, Action::SetByte2),
+ },
+ State::Tail1 => {
+ 0x80...0xbf => (State::Ground, Action::SetByte1),
+ },
+};
diff --git a/src/utf8/types.rs b/src/utf8/types.rs
new file mode 100644
index 0000000..4c604f4
--- /dev/null
+++ b/src/utf8/types.rs
@@ -0,0 +1,77 @@
+//! Types supporting the UTF-8 parser
+#![allow(non_camel_case_types)]
+use std::mem;
+
+/// States the parser can be in.
+///
+/// There is a state for each initial input of the 3 and 4 byte sequences since
+/// the following bytes are subject to different conditions than a tail byte.
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum State {
+ /// Ground state; expect anything
+ Ground = 0,
+ /// 3 tail bytes
+ Tail3 = 1,
+ /// 2 tail bytes
+ Tail2 = 2,
+ /// 1 tail byte
+ Tail1 = 3,
+ /// UTF8-3 starting with E0
+ U3_2_e0 = 4,
+ /// UTF8-3 starting with ED
+ U3_2_ed = 5,
+ /// UTF8-4 starting with F0
+ Utf8_4_3_f0 = 6,
+ /// UTF8-4 starting with F4
+ Utf8_4_3_f4 = 7,
+}
+
+/// Action to take when receiving a byte
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum Action {
+ /// Unexpected byte; sequence is invalid
+ InvalidSequence = 0,
+ /// Received valid 7-bit ASCII byte which can be directly emitted.
+ EmitByte = 1,
+ /// Set the bottom continuation byte
+ SetByte1 = 2,
+ /// Set the 2nd-from-last continuation byte
+ SetByte2 = 3,
+ /// Set the 2nd-from-last byte which is part of a two byte sequence
+ SetByte2Top = 4,
+ /// Set the 3rd-from-last continuation byte
+ SetByte3 = 5,
+ /// Set the 3rd-from-last byte which is part of a three byte sequence
+ SetByte3Top = 6,
+ /// Set the top byte of a four byte sequence.
+ SetByte4 = 7,
+}
+
+/// Convert a state and action to a u8
+///
+/// State will be the bottom 4 bits and action the top 4
+#[inline]
+#[allow(dead_code)]
+pub fn pack(state: State, action: Action) -> u8 {
+ ((action as u8) << 4) | (state as u8)
+}
+
+/// Convert a u8 to a state and action
+///
+/// # Safety
+///
+/// If this function is called with a byte that wasn't encoded with the `pack`
+/// function in this module, there is no guarantee that a valid state and action
+/// can be produced.
+#[inline]
+pub unsafe fn unpack(val: u8) -> (State, Action) {
+ (
+ // State is stored in bottom 4 bits
+ mem::transmute(val & 0x0f),
+
+ // Action is stored in top 4 bits
+ mem::transmute(val >> 4),
+ )
+}