diff options
author | Christian Duerr <contact@christianduerr.com> | 2025-01-09 06:27:15 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-09 06:27:15 +0000 |
commit | 7321a442a6fc0fc5b6d6ed7af364477d25e706fd (patch) | |
tree | 11ff2608e63a160b8b204b6f78ec3977f019d081 /src/lib.rs | |
parent | 89c12df969145ffb5084d1122627d7292c2c638f (diff) | |
download | r-alacritty-vte-7321a442a6fc0fc5b6d6ed7af364477d25e706fd.tar.gz r-alacritty-vte-7321a442a6fc0fc5b6d6ed7af364477d25e706fd.tar.bz2 r-alacritty-vte-7321a442a6fc0fc5b6d6ed7af364477d25e706fd.zip |
Switch parser to multi-byte processing
This patch overhauls the `Parser::advance` API to operate on byte slices
instead of individual bytes, which allows for additional performance
optimizations.
VTE does not support C1 escapes and C0 escapes always start with an
escape character. This makes it possible to simplify processing if a
byte stream is determined to not contain any escapes. The `memchr` crate
provides a battle-tested implementation for SIMD-accelerated byte
searches, which is why this implementation makes use of it.
VTE also only supports UTF8 characters in the ground state, which means
that the new non-escape parsing path is able to rely completely on STD's
`str::from_utf8` since `memchr` gives us the full length of the plain
text character buffer. This allows us to completely remove `utf8parse`
and all related code.
We also make use of `memchr` in the synchronized escape handling in
`ansi.rs`, since it relies heavily on scanning large amounts of text for
the extension/termination escape sequences.
Diffstat (limited to 'src/lib.rs')
-rw-r--r-- | src/lib.rs | 910 |
1 files changed, 546 insertions, 364 deletions
@@ -1,44 +1,39 @@ //! Parser for implementing virtual terminal emulators //! -//! [`Parser`] is implemented according to [Paul Williams' ANSI parser -//! state machine]. The state machine doesn't assign meaning to the parsed data -//! and is thus not itself sufficient for writing a terminal emulator. Instead, -//! it is expected that an implementation of [`Perform`] is provided which does +//! [`Parser`] is implemented according to [Paul Williams' ANSI parser state +//! machine]. The state machine doesn't assign meaning to the parsed data and is +//! thus not itself sufficient for writing a terminal emulator. Instead, it is +//! expected that an implementation of [`Perform`] is provided which does //! something useful with the parsed data. The [`Parser`] handles the book //! keeping, and the [`Perform`] gets to simply handle actions. //! //! # Examples //! -//! For an example of using the [`Parser`] please see the examples folder. The example included -//! there simply logs all the actions [`Perform`] does. One quick thing to see it in action is to -//! pipe `vim` into it +//! For an example of using the [`Parser`] please see the examples folder. The +//! example included there simply logs all the actions [`Perform`] does. One +//! quick way to see it in action is to pipe `printf` into it //! //! ```sh -//! cargo build --release --example parselog -//! vim | target/release/examples/parselog +//! printf '\x1b[31mExample' | cargo run --example parselog //! ``` //! -//! Just type `:q` to exit. -//! //! # Differences from original state machine description //! //! * UTF-8 Support for Input //! * OSC Strings can be terminated by 0x07 -//! * Only supports 7-bit codes. Some 8-bit codes are still supported, but they no longer work in -//! all states. +//! * Only supports 7-bit codes //! //! [`Parser`]: struct.Parser.html //! [`Perform`]: trait.Perform.html //! [Paul Williams' ANSI parser state machine]: https://vt100.net/emu/dec_ansi_parser #![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] -#![cfg_attr(all(feature = "nightly", test), feature(test))] #![cfg_attr(feature = "no_std", no_std)] use core::mem::MaybeUninit; +use core::str; #[cfg(feature = "no_std")] use arrayvec::ArrayVec; -use utf8parse as utf8; mod definitions; mod params; @@ -46,28 +41,13 @@ mod table; #[cfg(feature = "ansi")] pub mod ansi; -pub use params::{Params, ParamsIter}; - use definitions::{unpack, Action, State}; +pub use params::{Params, ParamsIter}; const MAX_INTERMEDIATES: usize = 2; const MAX_OSC_PARAMS: usize = 16; const MAX_OSC_RAW: usize = 1024; -struct VtUtf8Receiver<'a, P: Perform>(&'a mut P, &'a mut State); - -impl<P: Perform> utf8::Receiver for VtUtf8Receiver<'_, P> { - fn codepoint(&mut self, c: char) { - self.0.print(c); - *self.1 = State::Ground; - } - - fn invalid_sequence(&mut self) { - self.0.print('�'); - *self.1 = State::Ground; - } -} - /// Parser for raw _VTE_ protocol which delegates actions to a [`Perform`] /// /// [`Perform`]: trait.Perform.html @@ -88,7 +68,8 @@ pub struct Parser<const OSC_RAW_BUF_SIZE: usize = MAX_OSC_RAW> { osc_params: [(usize, usize); MAX_OSC_PARAMS], osc_num_params: usize, ignoring: bool, - utf8_parser: utf8::Parser, + partial_utf8: [u8; 4], + partial_utf8_len: usize, } impl Parser { @@ -99,7 +80,8 @@ impl Parser { } impl<const OSC_RAW_BUF_SIZE: usize> Parser<OSC_RAW_BUF_SIZE> { - /// Create a new Parser with a custom size for the Operating System Command buffer. + /// Create a new Parser with a custom size for the Operating System Command + /// buffer. /// /// Call with a const-generic param on `Parser`, like: /// @@ -121,41 +103,74 @@ impl<const OSC_RAW_BUF_SIZE: usize> Parser<OSC_RAW_BUF_SIZE> { &self.intermediates[..self.intermediate_idx] } - /// Advance the parser state + /// Advance the parser state. /// - /// Requires a [`Perform`] in case `byte` triggers an action + /// Requires a [`Perform`] implementation to handle the triggered actions. /// /// [`Perform`]: trait.Perform.html #[inline] - pub fn advance<P: Perform>(&mut self, performer: &mut P, byte: u8) { - // Utf8 characters are handled out-of-band. - if let State::Utf8 = self.state { - self.process_utf8(performer, byte); - return; - } - - // Handle state changes in the anywhere state before evaluating changes - // for current state. - let mut change = table::STATE_CHANGES[State::Anywhere as usize][byte as usize]; + pub fn advance<P: Perform>(&mut self, performer: &mut P, bytes: &[u8]) { + let mut i = 0; - if change == 0 { - change = table::STATE_CHANGES[self.state as usize][byte as usize]; + // Handle partial codepoints from previous calls to `advance`. + if self.partial_utf8_len > 0 { + i += self.advance_partial_utf8(performer, bytes); } - // Unpack into a state and action - let (state, action) = unpack(change); + while i != bytes.len() { + match self.state { + State::Ground => i += self.advance_ground(performer, &bytes[i..]), + _ => { + let byte = bytes[i]; + let change = table::STATE_CHANGES[self.state as usize][byte as usize]; + let (state, action) = unpack(change); - self.perform_state_change(performer, state, action, byte); + self.perform_state_change(performer, state, action, byte); + + i += 1; + }, + } + } } + /// Partially advance the parser state. + /// + /// This is equivalent to [`Self::advance`], but stops when + /// [`Perform::terminated`] is true after reading a byte. + /// + /// Returns the number of bytes read before termination. + /// + /// See [`Perform::advance`] for more details. #[inline] - fn process_utf8<P>(&mut self, performer: &mut P, byte: u8) - where - P: Perform, - { - let mut receiver = VtUtf8Receiver(performer, &mut self.state); - let utf8_parser = &mut self.utf8_parser; - utf8_parser.advance(&mut receiver, byte); + #[must_use = "Returned value should be used to processs the remaining bytes"] + pub fn advance_until_terminated<P: Perform>( + &mut self, + performer: &mut P, + bytes: &[u8], + ) -> usize { + let mut i = 0; + + // Handle partial codepoints from previous calls to `advance`. + if self.partial_utf8_len != 0 { + i += self.advance_partial_utf8(performer, bytes); + } + + while i != bytes.len() && !performer.terminated() { + match self.state { + State::Ground => i += self.advance_ground(performer, &bytes[i..]), + _ => { + let byte = bytes[i]; + let change = table::STATE_CHANGES[self.state as usize][byte as usize]; + let (state, action) = unpack(change); + + self.perform_state_change(performer, state, action, byte); + + i += 1; + }, + } + } + + i } #[inline] @@ -163,93 +178,75 @@ impl<const OSC_RAW_BUF_SIZE: usize> Parser<OSC_RAW_BUF_SIZE> { where P: Perform, { - macro_rules! maybe_action { - ($action:expr, $arg:expr) => { - match $action { - Action::None => (), - action => { - self.perform_action(performer, action, $arg); - }, - } - }; + if state == State::Anywhere { + self.perform_action(performer, action, byte); + return; } - match state { - State::Anywhere => { - // Just run the action - self.perform_action(performer, action, byte); - }, - state => { - match self.state { - State::DcsPassthrough => { - self.perform_action(performer, Action::Unhook, byte); - }, - State::OscString => { - self.perform_action(performer, Action::OscEnd, byte); - }, - _ => (), - } + match self.state { + State::DcsPassthrough => performer.unhook(), + State::OscString => { + let param_idx = self.osc_num_params; + let idx = self.osc_raw.len(); - maybe_action!(action, byte); + match param_idx { + // Finish last parameter if not already maxed + MAX_OSC_PARAMS => (), - match state { - State::CsiEntry | State::DcsEntry | State::Escape => { - self.perform_action(performer, Action::Clear, byte); - }, - State::DcsPassthrough => { - self.perform_action(performer, Action::Hook, byte); + // First param is special - 0 to current byte index + 0 => { + self.osc_params[param_idx] = (0, idx); + self.osc_num_params += 1; }, - State::OscString => { - self.perform_action(performer, Action::OscStart, byte); + + // All other params depend on previous indexing + _ => { + let prev = self.osc_params[param_idx - 1]; + let begin = prev.1; + self.osc_params[param_idx] = (begin, idx); + self.osc_num_params += 1; }, - _ => (), } - - // Assume the new state - self.state = state; + self.osc_dispatch(performer, byte); }, + _ => (), } - } - /// Separate method for osc_dispatch that borrows self as read-only - /// - /// The aliasing is needed here for multiple slices into self.osc_raw - #[inline] - fn osc_dispatch<P: Perform>(&self, performer: &mut P, byte: u8) { - let mut slices: [MaybeUninit<&[u8]>; MAX_OSC_PARAMS] = - unsafe { MaybeUninit::uninit().assume_init() }; + if action == Action::None { + match state { + State::CsiEntry | State::DcsEntry | State::Escape => self.reset_params(), + State::DcsPassthrough => { + if self.params.is_full() { + self.ignoring = true; + } else { + self.params.push(self.param); + } - for (i, slice) in slices.iter_mut().enumerate().take(self.osc_num_params) { - let indices = self.osc_params[i]; - *slice = MaybeUninit::new(&self.osc_raw[indices.0..indices.1]); + performer.hook( + self.params(), + self.intermediates(), + self.ignoring, + byte as char, + ); + }, + State::OscString => { + self.osc_raw.clear(); + self.osc_num_params = 0; + }, + _ => (), + } + } else { + self.perform_action(performer, action, byte); } - unsafe { - let num_params = self.osc_num_params; - let params = &slices[..num_params] as *const [MaybeUninit<&[u8]>] as *const [&[u8]]; - performer.osc_dispatch(&*params, byte == 0x07); - } + self.state = state; } #[inline] fn perform_action<P: Perform>(&mut self, performer: &mut P, action: Action, byte: u8) { match action { - Action::Print => performer.print(byte as char), Action::Execute => performer.execute(byte), - Action::Hook => { - if self.params.is_full() { - self.ignoring = true; - } else { - self.params.push(self.param); - } - - performer.hook(self.params(), self.intermediates(), self.ignoring, byte as char); - }, Action::Put => performer.put(byte), - Action::OscStart => { - self.osc_raw.clear(); - self.osc_num_params = 0; - }, Action::OscPut => { #[cfg(feature = "no_std")] { @@ -285,31 +282,6 @@ impl<const OSC_RAW_BUF_SIZE: usize> Parser<OSC_RAW_BUF_SIZE> { self.osc_raw.push(byte); } }, - Action::OscEnd => { - let param_idx = self.osc_num_params; - let idx = self.osc_raw.len(); - - match param_idx { - // Finish last parameter if not already maxed - MAX_OSC_PARAMS => (), - - // First param is special - 0 to current byte index - 0 => { - self.osc_params[param_idx] = (0, idx); - self.osc_num_params += 1; - }, - - // All other params depend on previous indexing - _ => { - let prev = self.osc_params[param_idx - 1]; - let begin = prev.1; - self.osc_params[param_idx] = (begin, idx); - self.osc_num_params += 1; - }, - } - self.osc_dispatch(performer, byte); - }, - Action::Unhook => performer.unhook(), Action::CsiDispatch => { if self.params.is_full() { self.ignoring = true; @@ -341,37 +313,203 @@ impl<const OSC_RAW_BUF_SIZE: usize> Parser<OSC_RAW_BUF_SIZE> { return; } - if byte == b';' { - self.params.push(self.param); - self.param = 0; - } else if byte == b':' { - self.params.extend(self.param); - self.param = 0; - } else { - // Continue collecting bytes into param - self.param = self.param.saturating_mul(10); - self.param = self.param.saturating_add((byte - b'0') as u16); + match byte { + b';' => { + self.params.push(self.param); + self.param = 0; + }, + b':' => { + self.params.extend(self.param); + self.param = 0; + }, + _ => { + // Continue collecting bytes into param + self.param = self.param.saturating_mul(10); + self.param = self.param.saturating_add((byte - b'0') as u16); + }, } }, - Action::Clear => { - // Reset everything on ESC/CSI/DCS entry - self.intermediate_idx = 0; - self.ignoring = false; - self.param = 0; + _ => (), + } + } + + /// Reset escape sequence parameters and intermediates. + #[inline] + fn reset_params(&mut self) { + self.intermediate_idx = 0; + self.ignoring = false; + self.param = 0; + + self.params.clear(); + } + + /// Separate method for osc_dispatch that borrows self as read-only + /// + /// The aliasing is needed here for multiple slices into self.osc_raw + #[inline] + fn osc_dispatch<P: Perform>(&self, performer: &mut P, byte: u8) { + let mut slices: [MaybeUninit<&[u8]>; MAX_OSC_PARAMS] = + unsafe { MaybeUninit::uninit().assume_init() }; + + for (i, slice) in slices.iter_mut().enumerate().take(self.osc_num_params) { + let indices = self.osc_params[i]; + *slice = MaybeUninit::new(&self.osc_raw[indices.0..indices.1]); + } + + unsafe { + let num_params = self.osc_num_params; + let params = &slices[..num_params] as *const [MaybeUninit<&[u8]>] as *const [&[u8]]; + performer.osc_dispatch(&*params, byte == 0x07); + } + } - self.params.clear(); + /// Advance the parser state from ground. + /// + /// The ground state is handled separately since it can only be left using + /// the escape character (`\x1b`). This allows more efficient parsing by + /// using SIMD search with [`memchr`]. + #[inline] + fn advance_ground<P: Perform>(&mut self, performer: &mut P, bytes: &[u8]) -> usize { + // Find the next escape character. + let num_bytes = bytes.len(); + let plain_chars = memchr::memchr(0x1B, bytes).unwrap_or(num_bytes); + + // If the next character is ESC, just process it and short-circuit. + if plain_chars == 0 { + self.state = State::Escape; + self.reset_params(); + return 1; + } + + match str::from_utf8(&bytes[..plain_chars]) { + Ok(parsed) => { + Self::ground_dispatch(performer, parsed); + let mut processed = plain_chars; + + // If there's another character, it must be escape so process it directly. + if processed < num_bytes { + self.state = State::Escape; + self.reset_params(); + processed += 1; + } + + processed + }, + // Handle invalid and partial utf8. + Err(err) => { + // Dispatch all the valid bytes. + let valid_bytes = err.valid_up_to(); + let parsed = unsafe { str::from_utf8_unchecked(&bytes[..valid_bytes]) }; + Self::ground_dispatch(performer, parsed); + + match err.error_len() { + Some(len) => { + // Execute C1 escapes or emit replacement character. + if len == 1 && bytes[valid_bytes] <= 0x9F { + performer.execute(bytes[valid_bytes]); + } else { + performer.print('�'); + } + + // Restart processing after the invalid bytes. + // + // While we could theoretically try to just re-parse + // `bytes[valid_bytes + len..plain_chars]`, it's easier + // to just skip it and invalid utf8 is pretty rare anyway. + valid_bytes + len + }, + None => { + if plain_chars < num_bytes { + // Process bytes cut off by escape. + performer.print('�'); + self.state = State::Escape; + self.reset_params(); + plain_chars + 1 + } else { + // Process bytes cut off by the buffer end. + let extra_bytes = num_bytes - valid_bytes; + let partial_len = self.partial_utf8_len + extra_bytes; + self.partial_utf8[self.partial_utf8_len..partial_len] + .copy_from_slice(&bytes[valid_bytes..valid_bytes + extra_bytes]); + self.partial_utf8_len = partial_len; + num_bytes + } + }, + } + }, + } + } + + /// Advance the parser while processing a partial utf8 codepoint. + #[inline] + fn advance_partial_utf8<P: Perform>(&mut self, performer: &mut P, bytes: &[u8]) -> usize { + // Try to copy up to 3 more characters, to ensure the codepoint is complete. + let old_bytes = self.partial_utf8_len; + let to_copy = bytes.len().min(self.partial_utf8.len() - old_bytes); + self.partial_utf8[old_bytes..old_bytes + to_copy].copy_from_slice(&bytes[..to_copy]); + self.partial_utf8_len += to_copy; + + // Parse the unicode character. + match str::from_utf8(&self.partial_utf8[..self.partial_utf8_len]) { + // If the entire buffer is valid, use the first character and continue parsing. + Ok(parsed) => { + let c = unsafe { parsed.chars().next().unwrap_unchecked() }; + performer.print(c); + + self.partial_utf8_len = 0; + c.len_utf8() - old_bytes + }, + Err(err) => { + match err.error_len() { + // If the partial character was also invalid, emit the replacement + // character. + Some(invalid_len) => { + performer.print('�'); + + self.partial_utf8_len = 0; + invalid_len - old_bytes + }, + None => { + // If we have any valid bytes, that means we partially copied another + // utf8 character into `partial_utf8`. Since we only care about the + // first character, we just ignore the rest. + let valid_bytes = err.valid_up_to(); + if valid_bytes > 0 { + let c = unsafe { + let parsed = + str::from_utf8_unchecked(&self.partial_utf8[..valid_bytes]); + parsed.chars().next().unwrap_unchecked() + }; + performer.print(c); + + self.partial_utf8_len = 0; + valid_bytes - old_bytes + } else { + // If the character still isn't complete, wait for more data. + bytes.len() + } + }, + } }, - Action::BeginUtf8 => self.process_utf8(performer, byte), - Action::Ignore => (), - Action::None => (), + } + } + + /// Handle ground dispatch of print/execute for all characters in a string. + #[inline] + fn ground_dispatch<P: Perform>(performer: &mut P, text: &str) { + for c in text.chars() { + match c { + '\x00'..='\x1f' | '\u{80}'..='\u{9f}' => performer.execute(c as u8), + _ => performer.print(c), + } } } } /// Performs actions requested by the Parser /// -/// Actions in this case mean, for example, handling a CSI escape sequence describing cursor -/// movement, or simply printing characters to the screen. +/// Actions in this case mean, for example, handling a CSI escape sequence +/// describing cursor movement, or simply printing characters to the screen. /// /// The methods on this type correspond to actions described in /// <http://vt100.net/emu/dec_ansi_parser>. I've done my best to describe them in @@ -385,19 +523,21 @@ pub trait Perform { /// Execute a C0 or C1 control function. fn execute(&mut self, _byte: u8) {} - /// Invoked when a final character arrives in first part of device control string. + /// Invoked when a final character arrives in first part of device control + /// string. /// - /// The control function should be determined from the private marker, final character, and - /// execute with a parameter list. A handler should be selected for remaining characters in the - /// string; the handler function should subsequently be called by `put` for every character in + /// The control function should be determined from the private marker, final + /// character, and execute with a parameter list. A handler should be + /// selected for remaining characters in the string; the handler + /// function should subsequently be called by `put` for every character in /// the control string. /// /// The `ignore` flag indicates that more than two intermediates arrived and /// subsequent characters were ignored. fn hook(&mut self, _params: &Params, _intermediates: &[u8], _ignore: bool, _action: char) {} - /// Pass bytes as part of a device control string to the handle chosen in `hook`. C0 controls - /// will also be passed to the handler. + /// Pass bytes as part of a device control string to the handle chosen in + /// `hook`. C0 controls will also be passed to the handler. fn put(&mut self, _byte: u8) {} /// Called when a device control string is terminated. @@ -411,9 +551,9 @@ pub trait Perform { /// A final character has arrived for a CSI sequence /// - /// The `ignore` flag indicates that either more than two intermediates arrived - /// or the number of parameters exceeded the maximum supported length, - /// and subsequent characters were ignored. + /// The `ignore` flag indicates that either more than two intermediates + /// arrived or the number of parameters exceeded the maximum supported + /// length, and subsequent characters were ignored. fn csi_dispatch( &mut self, _params: &Params, @@ -428,6 +568,19 @@ pub trait Perform { /// The `ignore` flag indicates that more than two intermediates arrived and /// subsequent characters were ignored. fn esc_dispatch(&mut self, _intermediates: &[u8], _ignore: bool, _byte: u8) {} + + /// Whether the parser should terminate prematurely. + /// + /// This can be used in conjunction with + /// [`Parser::advance_until_terminated`] to terminate the parser after + /// receiving certain escape sequences like synchronized updates. + /// + /// This is checked after every parsed byte, so no expensive computation + /// should take place in this function. + #[inline(always)] + fn terminated(&self) -> bool { + false + } } #[cfg(all(test, feature = "no_std"))] @@ -436,12 +589,12 @@ extern crate std; #[cfg(test)] mod tests { - use super::*; - use std::vec::Vec; - static OSC_BYTES: &[u8] = &[ - 0x1b, 0x5d, // Begin OSC + use super::*; + + const OSC_BYTES: &[u8] = &[ + 0x1B, 0x5D, // Begin OSC b'2', b';', b'j', b'w', b'i', b'l', b'm', b'@', b'j', b'w', b'i', b'l', b'm', b'-', b'd', b'e', b's', b'k', b':', b' ', b'~', b'/', b'c', b'o', b'd', b'e', b'/', b'a', b'l', b'a', b'c', b'r', b'i', b't', b't', b'y', 0x07, // End OSC @@ -459,6 +612,8 @@ mod tests { Esc(Vec<u8>, bool, u8), DcsHook(Vec<Vec<u16>>, Vec<u8>, bool, char), DcsPut(u8), + Print(char), + Execute(u8), DcsUnhook, } @@ -492,6 +647,14 @@ mod tests { fn unhook(&mut self) { self.dispatched.push(Sequence::DcsUnhook); } + + fn print(&mut self, c: char) { + self.dispatched.push(Sequence::Print(c)); + } + + fn execute(&mut self, byte: u8) { + self.dispatched.push(Sequence::Execute(byte)); + } } #[test] @@ -499,9 +662,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in OSC_BYTES { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, OSC_BYTES); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -519,9 +680,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in &[0x1b, 0x5d, 0x07] { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, &[0x1B, 0x5D, 0x07]); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -537,9 +696,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in input { - parser.advance(&mut dispatcher, byte); - } + parser.advance(&mut dispatcher, &input); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -553,13 +710,11 @@ mod tests { #[test] fn osc_bell_terminated() { - static INPUT: &[u8] = b"\x1b]11;ff/00/ff\x07"; + const INPUT: &[u8] = b"\x1b]11;ff/00/ff\x07"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -570,13 +725,11 @@ mod tests { #[test] fn osc_c0_st_terminated() { - static INPUT: &[u8] = b"\x1b]11;ff/00/ff\x1b\\"; + const INPUT: &[u8] = b"\x1b]11;ff/00/ff\x1b\\"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 2); match &dispatcher.dispatched[0] { @@ -587,37 +740,29 @@ mod tests { #[test] fn parse_osc_with_utf8_arguments() { - static INPUT: &[u8] = &[ - 0x0d, 0x1b, 0x5d, 0x32, 0x3b, 0x65, 0x63, 0x68, 0x6f, 0x20, 0x27, 0xc2, 0xaf, 0x5c, - 0x5f, 0x28, 0xe3, 0x83, 0x84, 0x29, 0x5f, 0x2f, 0xc2, 0xaf, 0x27, 0x20, 0x26, 0x26, - 0x20, 0x73, 0x6c, 0x65, 0x65, 0x70, 0x20, 0x31, 0x07, + const INPUT: &[u8] = &[ + 0x0D, 0x1B, 0x5D, 0x32, 0x3B, 0x65, 0x63, 0x68, 0x6F, 0x20, 0x27, 0xC2, 0xAF, 0x5C, + 0x5F, 0x28, 0xE3, 0x83, 0x84, 0x29, 0x5F, 0x2F, 0xC2, 0xAF, 0x27, 0x20, 0x26, 0x26, + 0x20, 0x73, 0x6C, 0x65, 0x65, 0x70, 0x20, 0x31, 0x07, ]; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); - assert_eq!(dispatcher.dispatched.len(), 1); - match &dispatcher.dispatched[0] { - Sequence::Osc(params, _) => { - assert_eq!(params[0], &[b'2']); - assert_eq!(params[1], &INPUT[5..(INPUT.len() - 1)]); - }, - _ => panic!("expected osc sequence"), - } + assert_eq!(dispatcher.dispatched[0], Sequence::Execute(b'\r')); + let osc_data = INPUT[5..(INPUT.len() - 1)].into(); + assert_eq!(dispatcher.dispatched[1], Sequence::Osc(vec![vec![b'2'], osc_data], true)); + assert_eq!(dispatcher.dispatched.len(), 2); } #[test] fn osc_containing_string_terminator() { - static INPUT: &[u8] = b"\x1b]2;\xe6\x9c\xab\x1b\\"; + const INPUT: &[u8] = b"\x1b]2;\xe6\x9c\xab\x1b\\"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 2); match &dispatcher.dispatched[0] { @@ -630,27 +775,21 @@ mod tests { #[test] fn exceed_max_buffer_size() { - static NUM_BYTES: usize = MAX_OSC_RAW + 100; - static INPUT_START: &[u8] = &[0x1b, b']', b'5', b'2', b';', b's']; - static INPUT_END: &[u8] = &[b'\x07']; + const NUM_BYTES: usize = MAX_OSC_RAW + 100; + const INPUT_START: &[u8] = b"\x1b]52;s"; + const INPUT_END: &[u8] = b"\x07"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); // Create valid OSC escape - for byte in INPUT_START { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT_START); // Exceed max buffer size - for _ in 0..NUM_BYTES { - parser.advance(&mut dispatcher, b'a'); - } + parser.advance(&mut dispatcher, &[b'a'; NUM_BYTES]); // Terminate escape for dispatch - for byte in INPUT_END { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT_END); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -679,9 +818,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in input { - parser.advance(&mut dispatcher, byte); - } + parser.advance(&mut dispatcher, &input); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -704,9 +841,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in input { - parser.advance(&mut dispatcher, byte); - } + parser.advance(&mut dispatcher, &input); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -723,9 +858,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in b"\x1b[4;m" { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, b"\x1b[4;m"); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -740,9 +873,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in b"\x1b[;4m" { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, b"\x1b[;4m"); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -754,35 +885,31 @@ mod tests { #[test] fn parse_long_csi_param() { // The important part is the parameter, which is (i64::MAX + 1) - static INPUT: &[u8] = b"\x1b[9223372036854775808m"; + const INPUT: &[u8] = b"\x1b[9223372036854775808m"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { - Sequence::Csi(params, ..) => assert_eq!(params, &[[std::u16::MAX as u16]]), + Sequence::Csi(params, ..) => assert_eq!(params, &[[u16::MAX]]), _ => panic!("expected csi sequence"), } } #[test] fn csi_reset() { - static INPUT: &[u8] = b"\x1b[3;1\x1b[?1049h"; + const INPUT: &[u8] = b"\x1b[3;1\x1b[?1049h"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { Sequence::Csi(params, intermediates, ignore, _) => { - assert_eq!(intermediates, &[b'?']); + assert_eq!(intermediates, b"?"); assert_eq!(params, &[[1049]]); assert!(!ignore); }, @@ -792,13 +919,11 @@ mod tests { #[test] fn csi_subparameters() { - static INPUT: &[u8] = b"\x1b[38:2:255:0:255;1m"; + const INPUT: &[u8] = b"\x1b[38:2:255:0:255;1m"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -818,9 +943,7 @@ mod tests { let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in input { - parser.advance(&mut dispatcher, byte); - } + parser.advance(&mut dispatcher, &input); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -835,18 +958,16 @@ mod tests { #[test] fn dcs_reset() { - static INPUT: &[u8] = b"\x1b[3;1\x1bP1$tx\x9c"; + const INPUT: &[u8] = b"\x1b[3;1\x1bP1$tx\x9c"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 3); match &dispatcher.dispatched[0] { Sequence::DcsHook(params, intermediates, ignore, _) => { - assert_eq!(intermediates, &[b'$']); + assert_eq!(intermediates, b"$"); assert_eq!(params, &[[1]]); assert!(!ignore); }, @@ -858,13 +979,11 @@ mod tests { #[test] fn parse_dcs() { - static INPUT: &[u8] = b"\x1bP0;1|17/ab\x9c"; + const INPUT: &[u8] = b"\x1bP0;1|17/ab\x9c"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 7); match &dispatcher.dispatched[0] { @@ -882,35 +1001,31 @@ mod tests { #[test] fn intermediate_reset_on_dcs_exit() { - static INPUT: &[u8] = b"\x1bP=1sZZZ\x1b+\x5c"; + const INPUT: &[u8] = b"\x1bP=1sZZZ\x1b+\x5c"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 6); match &dispatcher.dispatched[5] { - Sequence::Esc(intermediates, ..) => assert_eq!(intermediates, &[b'+']), + Sequence::Esc(intermediates, ..) => assert_eq!(intermediates, b"+"), _ => panic!("expected esc sequence"), } } #[test] fn esc_reset() { - static INPUT: &[u8] = b"\x1b[3;1\x1b(A"; + const INPUT: &[u8] = b"\x1b[3;1\x1b(A"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { Sequence::Esc(intermediates, ignore, byte) => { - assert_eq!(intermediates, &[b'(']); + assert_eq!(intermediates, b"("); assert_eq!(*byte, b'A'); assert!(!ignore); }, @@ -919,14 +1034,25 @@ mod tests { } #[test] + fn esc_reset_intermediates() { + const INPUT: &[u8] = b"\x1b[?2004l\x1b#8"; + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, INPUT); + + assert_eq!(dispatcher.dispatched.len(), 2); + assert_eq!(dispatcher.dispatched[0], Sequence::Csi(vec![vec![2004]], vec![63], false, 'l')); + assert_eq!(dispatcher.dispatched[1], Sequence::Esc(vec![35], false, 56)); + } + + #[test] fn params_buffer_filled_with_subparam() { - static INPUT: &[u8] = b"\x1b[::::::::::::::::::::::::::::::::x\x1b"; + const INPUT: &[u8] = b"\x1b[::::::::::::::::::::::::::::::::x\x1b"; let mut dispatcher = Dispatcher::default(); let mut parser = Parser::new(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -943,18 +1069,16 @@ mod tests { #[cfg(feature = "no_std")] #[test] fn build_with_fixed_size() { - static INPUT: &[u8] = b"\x1b[3;1\x1b[?1049h"; + const INPUT: &[u8] = b"\x1b[3;1\x1b[?1049h"; let mut dispatcher = Dispatcher::default(); let mut parser: Parser<30> = Parser::new_with_size(); - for byte in INPUT { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { Sequence::Csi(params, intermediates, ignore, _) => { - assert_eq!(intermediates, &[b'?']); + assert_eq!(intermediates, b"?"); assert_eq!(params, &[[1049]]); assert!(!ignore); }, @@ -966,27 +1090,21 @@ mod tests { #[test] fn exceed_fixed_osc_buffer_size() { const OSC_BUFFER_SIZE: usize = 32; - static NUM_BYTES: usize = OSC_BUFFER_SIZE + 100; - static INPUT_START: &[u8] = b"\x1b]52;"; - static INPUT_END: &[u8] = b"\x07"; + const NUM_BYTES: usize = OSC_BUFFER_SIZE + 100; + const INPUT_START: &[u8] = b"\x1b]52;"; + const INPUT_END: &[u8] = b"\x07"; let mut dispatcher = Dispatcher::default(); let mut parser: Parser<OSC_BUFFER_SIZE> = Parser::new_with_size(); // Create valid OSC escape - for byte in INPUT_START { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT_START); // Exceed max buffer size - for _ in 0..NUM_BYTES { - parser.advance(&mut dispatcher, b'a'); - } + parser.advance(&mut dispatcher, &[b'a'; NUM_BYTES]); // Terminate escape for dispatch - for byte in INPUT_END { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT_END); assert_eq!(dispatcher.dispatched.len(), 1); match &dispatcher.dispatched[0] { @@ -1005,22 +1123,16 @@ mod tests { #[cfg(feature = "no_std")] #[test] fn fixed_size_osc_containing_string_terminator() { - static INPUT_START: &[u8] = b"\x1b]2;"; - static INPUT_MIDDLE: &[u8] = b"s\xe6\x9c\xab"; - static INPUT_END: &[u8] = b"\x1b\\"; + const INPUT_START: &[u8] = b"\x1b]2;"; + const INPUT_MIDDLE: &[u8] = b"s\xe6\x9c\xab"; + const INPUT_END: &[u8] = b"\x1b\\"; let mut dispatcher = Dispatcher::default(); let mut parser: Parser<5> = Parser::new_with_size(); - for byte in INPUT_START { - parser.advance(&mut dispatcher, *byte); - } - for byte in INPUT_MIDDLE { - parser.advance(&mut dispatcher, *byte); - } - for byte in INPUT_END { - parser.advance(&mut dispatcher, *byte); - } + parser.advance(&mut dispatcher, INPUT_START); + parser.advance(&mut dispatcher, INPUT_MIDDLE); + parser.advance(&mut dispatcher, INPUT_END); assert_eq!(dispatcher.dispatched.len(), 2); match &dispatcher.dispatched[0] { @@ -1031,74 +1143,144 @@ mod tests { _ => panic!("expected osc sequence"), } } -} -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate std; - extern crate test; + #[test] + fn unicode() { + const INPUT: &[u8] = b"\xF0\x9F\x8E\x89_\xF0\x9F\xA6\x80\xF0\x9F\xA6\x80_\xF0\x9F\x8E\x89"; - use super::*; + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); - use test::{black_box, Bencher}; + parser.advance(&mut dispatcher, INPUT); - static VTE_DEMO: &[u8] = include_bytes!("../tests/demo.vte"); + assert_eq!(dispatcher.dispatched.len(), 6); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('🎉')); + assert_eq!(dispatcher.dispatched[1], Sequence::Print('_')); + assert_eq!(dispatcher.dispatched[2], Sequence::Print('🦀')); + assert_eq!(dispatcher.dispatched[3], Sequence::Print('🦀')); + assert_eq!(dispatcher.dispatched[4], Sequence::Print('_')); + assert_eq!(dispatcher.dispatched[5], Sequence::Print('🎉')); + } - struct BenchDispatcher; - impl Perform for BenchDispatcher { - fn print(&mut self, c: char) { - black_box(c); - } + #[test] + fn invalid_utf8() { + const INPUT: &[u8] = b"a\xEF\xBCb"; - fn execute(&mut self, byte: u8) { - black_box(byte); - } + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); - fn hook(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: char) { - black_box((params, intermediates, ignore, c)); - } + parser.advance(&mut dispatcher, INPUT); - fn put(&mut self, byte: u8) { - black_box(byte); - } + assert_eq!(dispatcher.dispatched.len(), 3); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('a')); + assert_eq!(dispatcher.dispatched[1], Sequence::Print('�')); + assert_eq!(dispatcher.dispatched[2], Sequence::Print('b')); + } - fn osc_dispatch(&mut self, params: &[&[u8]], bell_terminated: bool) { - black_box((params, bell_terminated)); - } + #[test] + fn partial_utf8() { + const INPUT: &[u8] = b"\xF0\x9F\x9A\x80"; - fn csi_dispatch(&mut self, params: &Params, intermediates: &[u8], ignore: bool, c: char) { - black_box((params, intermediates, ignore, c)); - } + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); - fn esc_dispatch(&mut self, intermediates: &[u8], ignore: bool, byte: u8) { - black_box((intermediates, ignore, byte)); - } + parser.advance(&mut dispatcher, &INPUT[..1]); + parser.advance(&mut dispatcher, &INPUT[1..2]); + parser.advance(&mut dispatcher, &INPUT[2..3]); + parser.advance(&mut dispatcher, &INPUT[3..]); + + assert_eq!(dispatcher.dispatched.len(), 1); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('🚀')); } - #[bench] - fn testfile(b: &mut Bencher) { - b.iter(|| { - let mut dispatcher = BenchDispatcher; - let mut parser = Parser::new(); + #[test] + fn partial_utf8_separating_utf8() { + // This is different from the `partial_utf8` test since it has a multi-byte UTF8 + // character after the partial UTF8 state, causing a partial byte to be present + // in the `partial_utf8` buffer after the 2-byte codepoint. - for byte in VTE_DEMO { - parser.advance(&mut dispatcher, *byte); - } - }); + // "ĸ🎉" + const INPUT: &[u8] = b"\xC4\xB8\xF0\x9F\x8E\x89"; + + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, &INPUT[..1]); + parser.advance(&mut dispatcher, &INPUT[1..]); + + assert_eq!(dispatcher.dispatched.len(), 2); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('ĸ')); + assert_eq!(dispatcher.dispatched[1], Sequence::Print('🎉')); } - #[bench] - fn state_changes(b: &mut Bencher) { - let input = b"\x1b]2;X\x1b\\ \x1b[0m \x1bP0@\x1b\\"; - b.iter(|| { - let mut dispatcher = BenchDispatcher; - let mut parser = Parser::new(); + #[test] + fn partial_invalid_utf8() { + const INPUT: &[u8] = b"a\xEF\xBCb"; - for _ in 0..1_000 { - for byte in input { - parser.advance(&mut dispatcher, *byte); - } - } - }); + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, &INPUT[..1]); + parser.advance(&mut dispatcher, &INPUT[1..2]); + parser.advance(&mut dispatcher, &INPUT[2..3]); + parser.advance(&mut dispatcher, &INPUT[3..]); + + assert_eq!(dispatcher.dispatched.len(), 3); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('a')); + assert_eq!(dispatcher.dispatched[1], Sequence::Print('�')); + assert_eq!(dispatcher.dispatched[2], Sequence::Print('b')); + } + + #[test] + fn partial_utf8_into_esc() { + const INPUT: &[u8] = b"\xD8\x1b012"; + + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, INPUT); + + assert_eq!(dispatcher.dispatched.len(), 4); + assert_eq!(dispatcher.dispatched[0], Sequence::Print('�')); + assert_eq!(dispatcher.dispatched[1], Sequence::Esc(Vec::new(), false, b'0')); + assert_eq!(dispatcher.dispatched[2], Sequence::Print('1')); + assert_eq!(dispatcher.dispatched[3], Sequence::Print('2')); + } + + #[test] + fn c1s() { + const INPUT: &[u8] = b"\x00\x1f\x80\x90\x98\x9b\x9c\x9d\x9e\x9fa"; + + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, INPUT); + + assert_eq!(dispatcher.dispatched.len(), 11); + assert_eq!(dispatcher.dispatched[0], Sequence::Execute(0)); + assert_eq!(dispatcher.dispatched[1], Sequence::Execute(31)); + assert_eq!(dispatcher.dispatched[2], Sequence::Execute(128)); + assert_eq!(dispatcher.dispatched[3], Sequence::Execute(144)); + assert_eq!(dispatcher.dispatched[4], Sequence::Execute(152)); + assert_eq!(dispatcher.dispatched[5], Sequence::Execute(155)); + assert_eq!(dispatcher.dispatched[6], Sequence::Execute(156)); + assert_eq!(dispatcher.dispatched[7], Sequence::Execute(157)); + assert_eq!(dispatcher.dispatched[8], Sequence::Execute(158)); + assert_eq!(dispatcher.dispatched[9], Sequence::Execute(159)); + assert_eq!(dispatcher.dispatched[10], Sequence::Print('a')); + } + + #[test] + fn execute_anywhere() { + const INPUT: &[u8] = b"\x18\x1a"; + + let mut dispatcher = Dispatcher::default(); + let mut parser = Parser::new(); + + parser.advance(&mut dispatcher, INPUT); + + assert_eq!(dispatcher.dispatched.len(), 2); + assert_eq!(dispatcher.dispatched[0], Sequence::Execute(0x18)); + assert_eq!(dispatcher.dispatched[1], Sequence::Execute(0x1A)); } } |