Add support for UTF-8

This adds a table-driven UTF-8 parser which only has a single branch for the entire parser. UTF-8 support is essentially bolted onto the VTE parser. Not the most elegant, but it does prevent the transition tables from blowing up. Instead of refactoring the syntax extension to handle both table definitions, I've opted to copy/paste for now, both for simplicity's sake and because I can't see a clear path to a minimal shared solution.
author: Joe Wilm <joe@jwilm.com> 2016-09-17 15:51:45 -0700
committer: Joe Wilm <joe@jwilm.com> 2016-09-17 17:03:20 -0700
commit: cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch)
tree: 26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/mod.rs
parent: 930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff)
download: r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip
1 files changed, 91 insertions, 0 deletions
diff --git a/src/utf8/mod.rs b/src/utf8/mod.rs
new file mode 100644
index 0000000..3d099b1
--- /dev/null
+++ b/src/utf8/mod.rs
@@ -0,0 +1,91 @@
+//! A table-driven UTF-8 Parser
+//!
+//! This module implements a table-driven UTF-8 parser which should
+//! theoretically contain the minimal number of branches (1). The only branch is
+//! on the `Action` returned from unpacking a transition.
+use std::char;
+
+mod types;
+use self::types::{State, Action, unpack};
+
+mod table;
+use self::table::TRANSITIONS;
+
+/// Sink for the code point and invalid-sequence events a `Parser` produces.
+pub trait Receiver {
+    /// Called once for every character the parser emits.
+    ///
+    /// `c` is the character that was just decoded.
+    fn codepoint(&mut self, c: char);
+
+    /// Called when the parser reports an invalid byte sequence.
+    fn invalid_sequence(&mut self);
+}
+
+/// An incremental, byte-at-a-time parser for UTF-8 characters.
+///
+/// Repeatedly call `advance` with bytes; results are reported to a `Receiver`.
+pub struct Parser {
+    point: u32, // code point bits accumulated so far for the sequence in progress
+    state: State, // current row of `TRANSITIONS`; `new` starts at `State::Ground`
+}
+
+/// Mask applied to continuation bytes; keeps only their low six bits.
+const CONTINUATION_MASK: u8 = 0b0011_1111;
+
+impl Parser {
+    /// Create a new `Parser` in the ground state with no partial code point.
+    pub fn new() -> Parser {
+        Parser {
+            point: 0,
+            state: State::Ground,
+        }
+    }
+
+    /// Advance the parser by a single byte.
+    ///
+    /// Looks up the transition for the current state and `byte`, performs the
+    /// associated action (which may call back into `receiver`), and only then
+    /// stores the next state.
+    pub fn advance<R>(&mut self, receiver: &mut R, byte: u8)
+        where R: Receiver
+    {
+        let cur = self.state as usize;
+        let change = TRANSITIONS[cur][byte as usize];
+        // SAFETY: assumes every entry of `TRANSITIONS` is a packed
+        // (state, action) pair that `unpack` accepts. TODO confirm against
+        // the `table` and `types` modules, which are not visible from here.
+        let (state, action) = unsafe { unpack(change) };
+
+        self.perform_action(receiver, byte, action);
+        self.state = state;
+    }
+
+    /// Apply `action` for `byte`: either fold bits of `byte` into the pending
+    /// code point, or notify `receiver` of a character / invalid sequence.
+    fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action)
+        where R: Receiver
+    {
+        match action {
+            Action::InvalidSequence => {
+                // Drop any partially assembled code point before reporting.
+                self.point = 0;
+                receiver.invalid_sequence();
+            },
+            Action::EmitByte => {
+                // The byte itself is the character; nothing is accumulated.
+                receiver.codepoint(byte as char);
+            },
+            Action::SetByte1 => {
+                // Final byte: its low six bits fill bits 0..=5, then emit.
+                let point = self.point | ((byte & CONTINUATION_MASK) as u32);
+                // SAFETY: assumes the transition table only reaches this
+                // action once the accumulated bits form a valid Unicode
+                // scalar value (no surrogates, nothing above U+10FFFF).
+                // TODO confirm against the `table` module.
+                let c = unsafe { char::from_u32_unchecked(point) };
+                self.point = 0;
+
+                receiver.codepoint(c);
+            },
+            Action::SetByte2 => {
+                // Low six bits of `byte` go into bits 6..=11.
+                self.point |= ((byte & CONTINUATION_MASK) as u32) << 6;
+            },
+            Action::SetByte2Top => {
+                // Low five bits of `byte` go into bits 6..=10.
+                self.point |= ((byte & 0b0001_1111) as u32) << 6;
+            },
+            Action::SetByte3 => {
+                // Low six bits of `byte` go into bits 12..=17.
+                self.point |= ((byte & CONTINUATION_MASK) as u32) << 12;
+            },
+            Action::SetByte3Top => {
+                // Low four bits of `byte` go into bits 12..=15.
+                self.point |= ((byte & 0b0000_1111) as u32) << 12;
+            },
+            Action::SetByte4 => {
+                // Low three bits of `byte` go into bits 18..=20.
+                self.point |= ((byte & 0b0000_0111) as u32) << 18;
+            },
+        }
+    }
+}
+
+impl Default for Parser {
+    /// Equivalent to `Parser::new()`.
+    fn default() -> Parser {
+        Parser::new()
+    }
+}
author	Joe Wilm <joe@jwilm.com>	2016-09-17 15:51:45 -0700
committer	Joe Wilm <joe@jwilm.com>	2016-09-17 17:03:20 -0700
commit	cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch)
tree	26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/mod.rs
parent	930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff)
download	r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2 r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip