From cffdb6de59ceb3fd9983a1c19476e5109da8db97 Mon Sep 17 00:00:00 2001 From: Joe Wilm Date: Sat, 17 Sep 2016 15:51:45 -0700 Subject: Add support for UTF-8 This adds a table-driven UTF-8 parser which only has a single branch for the entire parser. UTF-8 support is essentially bolted onto the VTE parser. Not the most elegant, but it does prevent the transition tables from blowing up. Instead of refactoring the syntax extension to handle both table definitions, I've opted to copy/paste now for both simplicities sake and because I can't see a clear path to a minimal shared solution. --- src/utf8/types.rs | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/utf8/types.rs (limited to 'src/utf8/types.rs') diff --git a/src/utf8/types.rs b/src/utf8/types.rs new file mode 100644 index 0000000..4c604f4 --- /dev/null +++ b/src/utf8/types.rs @@ -0,0 +1,77 @@ +//! Types supporting the UTF-8 parser +#![allow(non_camel_case_types)] +use std::mem; + +/// States the parser can be in. +/// +/// There is a state for each initial input of the 3 and 4 byte sequences since +/// the following bytes are subject to different conditions than a tail byte. +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum State { + /// Ground state; expect anything + Ground = 0, + /// 3 tail bytes + Tail3 = 1, + /// 2 tail bytes + Tail2 = 2, + /// 1 tail byte + Tail1 = 3, + /// UTF8-3 starting with E0 + U3_2_e0 = 4, + /// UTF8-3 starting with ED + U3_2_ed = 5, + /// UTF8-4 starting with F0 + Utf8_4_3_f0 = 6, + /// UTF8-4 starting with F4 + Utf8_4_3_f4 = 7, +} + +/// Action to take when receiving a byte +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +pub enum Action { + /// Unexpected byte; sequence is invalid + InvalidSequence = 0, + /// Received valid 7-bit ASCII byte which can be directly emitted. + EmitByte = 1, + /// Set the bottom continuation byte + SetByte1 = 2, + /// Set the 2nd-from-last continuation byte + SetByte2 = 3, + /// Set the 2nd-from-last byte which is part of a two byte sequence + SetByte2Top = 4, + /// Set the 3rd-from-last continuation byte + SetByte3 = 5, + /// Set the 3rd-from-last byte which is part of a three byte sequence + SetByte3Top = 6, + /// Set the top byte of a four byte sequence. + SetByte4 = 7, +} + +/// Convert a state and action to a u8 +/// +/// State will be the bottom 4 bits and action the top 4 +#[inline] +#[allow(dead_code)] +pub fn pack(state: State, action: Action) -> u8 { + ((action as u8) << 4) | (state as u8) +} + +/// Convert a u8 to a state and action +/// +/// # Unsafety +/// +/// If this function is called with a byte that wasn't encoded with the `pack` +/// function in this module, there is no guarantee that a valid state and action +/// can be produced. +#[inline] +pub unsafe fn unpack(val: u8) -> (State, Action) { + ( + // State is stored in bottom 4 bits + mem::transmute(val & 0x0f), + + // Action is stored in top 4 bits + mem::transmute(val >> 4), + ) +} -- cgit