diff options
Diffstat (limited to 'src/io.rs')
-rw-r--r-- | src/io.rs | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 00000000..688e72a4 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,194 @@ +// Copyright 2015 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Unmerged utf8 chars iterator vendored from std::io +//! +use std::io::{BufRead, ErrorKind, Error}; +use std::fmt; +use std::error as std_error; +use std::result; +use std::char; + +static UTF8_CHAR_WIDTH: [u8; 256] = [ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF +0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF +3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF +4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF +]; + +/// Given a first byte, determine how many bytes are in this UTF-8 character +#[inline] +pub fn utf8_char_width(b: u8) -> usize { + return UTF8_CHAR_WIDTH[b as usize] as usize; +} + +/// An iterator over the `char`s of a reader. +/// +/// This struct is generally created by calling [`utf8_chars()`][utf8_chars] on a reader. +/// Please see the documentation of `utf8_chars()` for more details. +/// +/// [utf8_chars]: trait.BufRead.html#method.utf8_chars +pub struct Utf8Chars<R> { + inner: R, +} + +impl<R> Utf8Chars<R> { + pub fn new(inner: R) -> Utf8Chars<R> { + Utf8Chars { inner: inner } + } +} + +/// An enumeration of possible errors that can be generated from the `Utf8Chars` +/// adapter. +#[derive(Debug)] +pub enum Utf8CharsError { + /// Variant representing that the underlying stream was read successfully + /// but contains a byte sequence ill-formed in UTF-8. + InvalidUtf8, + + /// Variant representing that the underlying stream contains the start + /// of a byte sequence well-formed in UTF-8, but ends prematurely. + /// + /// Contains number of unused bytes + IncompleteUtf8(u8), + + /// Variant representing that an I/O error occurred. + Io(Error), +} + +impl<R: BufRead> Iterator for Utf8Chars<R> { + type Item = result::Result<char, Utf8CharsError>; + + // allow(unused_assignments) because consumed += 1 is not recognized as being used + #[allow(unused_assignments)] + fn next(&mut self) -> Option<result::Result<char, Utf8CharsError>> { + macro_rules! read_byte { + (EOF => $on_eof: expr) => { + { + let byte; + loop { + match self.inner.fill_buf() { + Ok(buffer) => { + if let Some(&b) = buffer.first() { + byte = b; + break + } else { + $on_eof + } + } + Err(ref e) if e.kind() == ErrorKind::Interrupted => {} + Err(e) => return Some(Err(Utf8CharsError::Io(e))), + } + } + byte + } + } + } + + let first = read_byte!(EOF => return None); + self.inner.consume(1); + + let mut consumed = 1; + + macro_rules! continuation_byte { + ($range: pat) => { + { + match read_byte!(EOF => return Some(Err(Utf8CharsError::IncompleteUtf8(consumed)))) { + byte @ $range => { + self.inner.consume(1); + consumed += 1; + (byte & 0b0011_1111) as u32 + } + _ => return Some(Err(Utf8CharsError::InvalidUtf8)) + } + } + } + } + + // Ranges can be checked against https://tools.ietf.org/html/rfc3629#section-4 + let code_point = match utf8_char_width(first) { + 1 => return Some(Ok(first as char)), + 2 => { + let second = continuation_byte!(0x80...0xBF); + ((first & 0b0001_1111) as u32) << 6 | second + } + 3 => { + let second = match first { + 0xE0 => continuation_byte!(0xA0...0xBF), + 0xE1...0xEC => continuation_byte!(0x80...0xBF), + 0xED => continuation_byte!(0x80...0x9F), + 0xEE...0xEF => continuation_byte!(0x80...0xBF), + _ => unreachable!(), + }; + let third = continuation_byte!(0x80...0xBF); + ((first & 0b0000_1111) as u32) << 12 | second << 6 | third + } + 4 => { + let second = match first { + 0xF0 => continuation_byte!(0x90...0xBF), + 0xF0...0xF3 => continuation_byte!(0x80...0xBF), + 0xF4 => continuation_byte!(0x80...0x8F), + _ => unreachable!(), + }; + let third = continuation_byte!(0x80...0xBF); + let fourth = continuation_byte!(0x80...0xBF); + ((first & 0b0000_0111) as u32) << 18 | second << 12 | third << 6 | fourth + } + _ => return Some(Err(Utf8CharsError::InvalidUtf8)) + }; + unsafe { + Some(Ok(char::from_u32_unchecked(code_point))) + } + } +} + +impl std_error::Error for Utf8CharsError { + fn description(&self) -> &str { + match *self { + Utf8CharsError::InvalidUtf8 => "invalid UTF-8 byte sequence", + Utf8CharsError::IncompleteUtf8(_) => { + "stream ended in the middle of an UTF-8 byte sequence" + } + Utf8CharsError::Io(ref e) => std_error::Error::description(e), + } + } + fn cause(&self) -> Option<&std_error::Error> { + match *self { + Utf8CharsError::InvalidUtf8 | Utf8CharsError::IncompleteUtf8(_) => None, + Utf8CharsError::Io(ref e) => e.cause(), + } + } +} + +impl fmt::Display for Utf8CharsError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + Utf8CharsError::InvalidUtf8 => { + "invalid UTF-8 byte sequence".fmt(f) + } + Utf8CharsError::IncompleteUtf8(_) => { + "stream ended in the middle of an UTF-8 byte sequence".fmt(f) + } + Utf8CharsError::Io(ref e) => e.fmt(f), + } + } +} |