Add support for UTF-8

This adds a table-driven UTF-8 parser which only has a single branch for the entire parser. UTF-8 support is essentially bolted onto the VTE parser. Not the most elegant, but it does prevent the transition tables from blowing up. Instead of refactoring the syntax extension to handle both table definitions, I've opted to copy/paste now for both simplicities sake and because I can't see a clear path to a minimal shared solution.
author: Joe Wilm <joe@jwilm.com> 2016-09-17 15:51:45 -0700
committer: Joe Wilm <joe@jwilm.com> 2016-09-17 17:03:20 -0700
commit: cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch)
tree: 26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/types.rs
parent: 930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff)
download: r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2
r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip
1 files changed, 77 insertions, 0 deletions
diff --git a/src/utf8/types.rs b/src/utf8/types.rs
new file mode 100644
index 0000000..4c604f4
--- /dev/null
+++ b/src/utf8/types.rs
@@ -0,0 +1,77 @@
+//! Types supporting the UTF-8 parser
+#![allow(non_camel_case_types)]
+use std::mem;
+
+/// States the parser can be in.
+///
+/// There is a state for each initial input of the 3 and 4 byte sequences since
+/// the following bytes are subject to different conditions than a tail byte.
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum State {
+    /// Ground state; expect anything
+    Ground = 0,
+    /// 3 tail bytes
+    Tail3 = 1,
+    /// 2 tail bytes
+    Tail2 = 2,
+    /// 1 tail byte
+    Tail1 = 3,
+    /// UTF8-3 starting with E0
+    U3_2_e0 = 4,
+    /// UTF8-3 starting with ED
+    U3_2_ed = 5,
+    /// UTF8-4 starting with F0
+    Utf8_4_3_f0 = 6,
+    /// UTF8-4 starting with F4
+    Utf8_4_3_f4 = 7,
+}
+
+/// Action to take when receiving a byte
+#[allow(dead_code)]
+#[derive(Debug, Copy, Clone)]
+pub enum Action {
+    /// Unexpected byte; sequence is invalid
+    InvalidSequence = 0,
+    /// Received valid 7-bit ASCII byte which can be directly emitted.
+    EmitByte = 1,
+    /// Set the bottom continuation byte
+    SetByte1 = 2,
+    /// Set the 2nd-from-last continuation byte
+    SetByte2 = 3,
+    /// Set the 2nd-from-last byte which is part of a two byte sequence
+    SetByte2Top = 4,
+    /// Set the 3rd-from-last continuation byte
+    SetByte3 = 5,
+    /// Set the 3rd-from-last byte which is part of a three byte sequence
+    SetByte3Top = 6,
+    /// Set the top byte of a four byte sequence.
+    SetByte4 = 7,
+}
+
+/// Convert a state and action to a u8
+///
+/// State will be the bottom 4 bits and action the top 4
+#[inline]
+#[allow(dead_code)]
+pub fn pack(state: State, action: Action) -> u8 {
+    ((action as u8) << 4) | (state as u8)
+}
+
+/// Convert a u8 to a state and action
+///
+/// # Unsafety
+///
+/// If this function is called with a byte that wasn't encoded with the `pack`
+/// function in this module, there is no guarantee that a valid state and action
+/// can be produced.
+#[inline]
+pub unsafe fn unpack(val: u8) -> (State, Action) {
+    (
+        // State is stored in bottom 4 bits
+        mem::transmute(val & 0x0f),
+
+        // Action is stored in top 4 bits
+        mem::transmute(val >> 4),
+    )
+}
author	Joe Wilm <joe@jwilm.com>	2016-09-17 15:51:45 -0700
committer	Joe Wilm <joe@jwilm.com>	2016-09-17 17:03:20 -0700
commit	cffdb6de59ceb3fd9983a1c19476e5109da8db97 (patch)
tree	26603abf607d21eefd3b9a6ac79a36dab63b5781 /src/utf8/types.rs
parent	930f8cc30a5bc4943c1b56e18cf1a3f8bb00bc2a (diff)
download	r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.gz r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.tar.bz2 r-alacritty-vte-cffdb6de59ceb3fd9983a1c19476e5109da8db97.zip