diff options
Diffstat (limited to 'utf8parse')
-rw-r--r-- | utf8parse/Cargo.toml | 15 | ||||
l--------- | utf8parse/LICENSE-APACHE | 1 | ||||
l--------- | utf8parse/LICENSE-MIT | 1 | ||||
-rw-r--r-- | utf8parse/src/lib.rs | 132 | ||||
-rw-r--r-- | utf8parse/src/types.rs | 100 | ||||
-rw-r--r-- | utf8parse/tests/UTF-8-demo.txt | 212 | ||||
-rw-r--r-- | utf8parse/tests/utf-8-demo.rs | 31 |
7 files changed, 0 insertions, 492 deletions
diff --git a/utf8parse/Cargo.toml b/utf8parse/Cargo.toml deleted file mode 100644 index 71ea44b..0000000 --- a/utf8parse/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -authors = ["Joe Wilm <joe@jwilm.com>", "Christian Duerr <contact@christianduerr.com>"] -description = "Table-driven UTF-8 parser" -documentation = "https://docs.rs/utf8parse/" -repository = "https://github.com/alacritty/vte" -keywords = ["utf8", "parse", "table"] -categories = ["parsing", "no-std"] -license = "Apache-2.0 OR MIT" -version = "0.2.2" -name = "utf8parse" -edition = "2018" - -[features] -nightly = [] -default = [] diff --git a/utf8parse/LICENSE-APACHE b/utf8parse/LICENSE-APACHE deleted file mode 120000 index 965b606..0000000 --- a/utf8parse/LICENSE-APACHE +++ /dev/null @@ -1 +0,0 @@ -../LICENSE-APACHE
\ No newline at end of file diff --git a/utf8parse/LICENSE-MIT b/utf8parse/LICENSE-MIT deleted file mode 120000 index 76219eb..0000000 --- a/utf8parse/LICENSE-MIT +++ /dev/null @@ -1 +0,0 @@ -../LICENSE-MIT
\ No newline at end of file diff --git a/utf8parse/src/lib.rs b/utf8parse/src/lib.rs deleted file mode 100644 index 093de81..0000000 --- a/utf8parse/src/lib.rs +++ /dev/null @@ -1,132 +0,0 @@ -//! A table-driven UTF-8 Parser -//! -//! This module implements a table-driven UTF-8 parser which should -//! theoretically contain the minimal number of branches (1). The only branch is -//! on the `Action` returned from unpacking a transition. -#![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] -#![cfg_attr(all(feature = "nightly", test), feature(test))] -#![no_std] - -use core::char; - -mod types; - -use types::{Action, State}; - -/// Handles codepoint and invalid sequence events from the parser. -pub trait Receiver { - /// Called whenever a codepoint is parsed successfully - fn codepoint(&mut self, _: char); - - /// Called when an invalid_sequence is detected - fn invalid_sequence(&mut self); -} - -/// A parser for Utf8 Characters -/// -/// Repeatedly call `advance` with bytes to emit Utf8 characters -#[derive(Clone, Default, PartialEq, Eq, Debug)] -pub struct Parser { - point: u32, - state: State, -} - -/// Continuation bytes are masked with this value. -const CONTINUATION_MASK: u8 = 0b0011_1111; - -impl Parser { - /// Create a new Parser - pub fn new() -> Parser { - Parser { point: 0, state: State::Ground } - } - - /// Advance the parser - /// - /// The provider receiver will be called whenever a codepoint is completed or an invalid - /// sequence is detected. - pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) - where - R: Receiver, - { - let (state, action) = self.state.advance(byte); - self.perform_action(receiver, byte, action); - self.state = state; - } - - fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) - where - R: Receiver, - { - match action { - Action::InvalidSequence => { - self.point = 0; - receiver.invalid_sequence(); - }, - Action::EmitByte => { - receiver.codepoint(byte as char); - }, - Action::SetByte1 => { - let point = self.point | ((byte & CONTINUATION_MASK) as u32); - let c = unsafe { char::from_u32_unchecked(point) }; - self.point = 0; - - receiver.codepoint(c); - }, - Action::SetByte2 => { - self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; - }, - Action::SetByte2Top => { - self.point |= ((byte & 0b0001_1111) as u32) << 6; - }, - Action::SetByte3 => { - self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; - }, - Action::SetByte3Top => { - self.point |= ((byte & 0b0000_1111) as u32) << 12; - }, - Action::SetByte4 => { - self.point |= ((byte & 0b0000_0111) as u32) << 18; - }, - } - } -} - -#[cfg(all(feature = "nightly", test))] -mod benches { - extern crate std; - extern crate test; - - use super::{Parser, Receiver}; - - use self::test::{black_box, Bencher}; - - static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt"); - - impl Receiver for () { - fn codepoint(&mut self, c: char) { - black_box(c); - } - - fn invalid_sequence(&mut self) {} - } - - #[bench] - fn parse_bench_utf8_demo(b: &mut Bencher) { - let mut parser = Parser::new(); - - b.iter(|| { - for byte in UTF8_DEMO { - parser.advance(&mut (), *byte); - } - }) - } - - #[bench] - fn std_string_parse_utf8(b: &mut Bencher) { - b.iter(|| { - for c in std::str::from_utf8(UTF8_DEMO).unwrap().chars() { - black_box(c); - } - }); - } -} diff --git a/utf8parse/src/types.rs b/utf8parse/src/types.rs deleted file mode 100644 index 8a52c67..0000000 --- a/utf8parse/src/types.rs +++ /dev/null @@ -1,100 +0,0 @@ -//! Types supporting the UTF-8 parser - -/// Action to take when receiving a byte -#[derive(Debug, Copy, Clone)] -pub enum Action { - /// Unexpected byte; sequence is invalid - InvalidSequence = 0, - /// Received valid 7-bit ASCII byte which can be directly emitted. - EmitByte = 1, - /// Set the bottom continuation byte - SetByte1 = 2, - /// Set the 2nd-from-last continuation byte - SetByte2 = 3, - /// Set the 2nd-from-last byte which is part of a two byte sequence - SetByte2Top = 4, - /// Set the 3rd-from-last continuation byte - SetByte3 = 5, - /// Set the 3rd-from-last byte which is part of a three byte sequence - SetByte3Top = 6, - /// Set the top byte of a four byte sequence. - SetByte4 = 7, -} - -/// States the parser can be in. -/// -/// There is a state for each initial input of the 3 and 4 byte sequences since -/// the following bytes are subject to different conditions than a tail byte. -#[allow(non_camel_case_types)] -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] -pub enum State { - /// Ground state; expect anything - #[default] - Ground = 0, - /// 3 tail bytes - Tail3 = 1, - /// 2 tail bytes - Tail2 = 2, - /// 1 tail byte - Tail1 = 3, - /// UTF8-3 starting with E0 - U3_2_e0 = 4, - /// UTF8-3 starting with ED - U3_2_ed = 5, - /// UTF8-4 starting with F0 - Utf8_4_3_f0 = 6, - /// UTF8-4 starting with F4 - Utf8_4_3_f4 = 7, -} - -impl State { - /// Advance the parser state. - /// - /// This takes the current state and input byte into consideration, to determine the next state - /// and any action that should be taken. - #[inline] - pub fn advance(self, byte: u8) -> (State, Action) { - match self { - State::Ground => match byte { - 0x00..=0x7f => (State::Ground, Action::EmitByte), - 0xc2..=0xdf => (State::Tail1, Action::SetByte2Top), - 0xe0 => (State::U3_2_e0, Action::SetByte3Top), - 0xe1..=0xec => (State::Tail2, Action::SetByte3Top), - 0xed => (State::U3_2_ed, Action::SetByte3Top), - 0xee..=0xef => (State::Tail2, Action::SetByte3Top), - 0xf0 => (State::Utf8_4_3_f0, Action::SetByte4), - 0xf1..=0xf3 => (State::Tail3, Action::SetByte4), - 0xf4 => (State::Utf8_4_3_f4, Action::SetByte4), - _ => (State::Ground, Action::InvalidSequence), - }, - State::U3_2_e0 => match byte { - 0xa0..=0xbf => (State::Tail1, Action::SetByte2), - _ => (State::Ground, Action::InvalidSequence), - }, - State::U3_2_ed => match byte { - 0x80..=0x9f => (State::Tail1, Action::SetByte2), - _ => (State::Ground, Action::InvalidSequence), - }, - State::Utf8_4_3_f0 => match byte { - 0x90..=0xbf => (State::Tail2, Action::SetByte3), - _ => (State::Ground, Action::InvalidSequence), - }, - State::Utf8_4_3_f4 => match byte { - 0x80..=0x8f => (State::Tail2, Action::SetByte3), - _ => (State::Ground, Action::InvalidSequence), - }, - State::Tail3 => match byte { - 0x80..=0xbf => (State::Tail2, Action::SetByte3), - _ => (State::Ground, Action::InvalidSequence), - }, - State::Tail2 => match byte { - 0x80..=0xbf => (State::Tail1, Action::SetByte2), - _ => (State::Ground, Action::InvalidSequence), - }, - State::Tail1 => match byte { - 0x80..=0xbf => (State::Ground, Action::SetByte1), - _ => (State::Ground, Action::InvalidSequence), - }, - } - } -} diff --git a/utf8parse/tests/UTF-8-demo.txt b/utf8parse/tests/UTF-8-demo.txt deleted file mode 100644 index 4363f27..0000000 --- a/utf8parse/tests/UTF-8-demo.txt +++ /dev/null @@ -1,212 +0,0 @@ - -UTF-8 encoded sample plain-text file -‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ - -Markus Kuhn [ˈmaʳkʊs kuːn] <http://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25 - - -The ASCII compatible UTF-8 encoding used in this plain-text file -is defined in Unicode, ISO 10646-1, and RFC 2279. - - -Using Unicode/UTF-8, you can write in emails and source code things such as - -Mathematics and sciences: - - ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ⎧⎡⎛┌─────┐⎞⎤⎫ - ⎪⎢⎜│a²+b³ ⎟⎥⎪ - ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ⎪⎢⎜│───── ⎟⎥⎪ - ⎪⎢⎜⎷ c₈ ⎟⎥⎪ - ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⎨⎢⎜ ⎟⎥⎬ - ⎪⎢⎜ ∞ ⎟⎥⎪ - ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫), ⎪⎢⎜ ⎲ ⎟⎥⎪ - ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪ - 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm ⎩⎣⎝i=1 ⎠⎦⎭ - -Linguistics and dictionaries: - - ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn - Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] - -APL: - - ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈ - -Nicer typography in plain text files: - - ╔══════════════════════════════════════════╗ - ║ ║ - ║ • ‘single’ and “double” quotes ║ - ║ ║ - ║ • Curly apostrophes: “We’ve been here” ║ - ║ ║ - ║ • Latin-1 apostrophe and accents: '´` ║ - ║ ║ - ║ • ‚deutsche‘ „Anführungszeichen“ ║ - ║ ║ - ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║ - ║ ║ - ║ • ASCII safety test: 1lI|, 0OD, 8B ║ - ║ ╭─────────╮ ║ - ║ • the euro symbol: │ 14.95 € │ ║ - ║ ╰─────────╯ ║ - ╚══════════════════════════════════════════╝ - -Combining characters: - - STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑ - -Greek (in Polytonic): - - The Greek anthem: - - Σὲ γνωρίζω ἀπὸ τὴν κόψη - τοῦ σπαθιοῦ τὴν τρομερή, - σὲ γνωρίζω ἀπὸ τὴν ὄψη - ποὺ μὲ βία μετράει τὴ γῆ. - - ᾿Απ᾿ τὰ κόκκαλα βγαλμένη - τῶν ῾Ελλήνων τὰ ἱερά - καὶ σὰν πρῶτα ἀνδρειωμένη - χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά! - - From a speech of Demosthenes in the 4th century BC: - - Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, - ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς - λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ - τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿ - εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ - πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν - οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι, - οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν - ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον - τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι - γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν - προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους - σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ - τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ - τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς - τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον. - - Δημοσθένους, Γ´ ᾿Ολυνθιακὸς - -Georgian: - - From a Unicode conference invitation: - - გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო - კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს, - ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს - ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი, - ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება - ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში, - ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში. - -Russian: - - From a Unicode conference invitation: - - Зарегистрируйтесь сейчас на Десятую Международную Конференцию по - Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии. - Конференция соберет широкий круг экспертов по вопросам глобального - Интернета и Unicode, локализации и интернационализации, воплощению и - применению Unicode в различных операционных системах и программных - приложениях, шрифтах, верстке и многоязычных компьютерных системах. - -Thai (UCS Level 2): - - Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese - classic 'San Gua'): - - [----------------------------|------------------------] - ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่ - สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา - ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา - โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ - เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ - ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ - พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้ - ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ - - (The above is a two-column text. If combining characters are handled - correctly, the lines of the second column should be aligned with the - | character above.) - -Ethiopian: - - Proverbs in the Amharic language: - - ሰማይ አይታረስ ንጉሥ አይከሰስ። - ብላ ካለኝ እንደአባቴ በቆመጠኝ። - ጌጥ ያለቤቱ ቁምጥና ነው። - ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው። - የአፍ ወለምታ በቅቤ አይታሽም። - አይጥ በበላ ዳዋ ተመታ። - ሲተረጉሙ ይደረግሙ። - ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል። - ድር ቢያብር አንበሳ ያስር። - ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም። - እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም። - የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ። - ሥራ ከመፍታት ልጄን ላፋታት። - ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል። - የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ። - ተንጋሎ ቢተፉ ተመልሶ ባፉ። - ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው። - እግርህን በፍራሽህ ልክ ዘርጋ። - -Runes: - - ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ - - (Old English, which transcribed into Latin reads 'He cwaeth that he - bude thaem lande northweardum with tha Westsae.' and means 'He said - that he lived in the northern land near the Western Sea.') - -Braille: - - ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌ - - ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞ - ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎ - ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂ - ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙ - ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ - ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲ - - ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ - - ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹ - ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞ - ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕ - ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹ - ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎ - ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎ - ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳ - ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞ - ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ - - (The first couple of paragraphs of "A Christmas Carol" by Dickens) - -Compact font selection example text: - - ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 - abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ - –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд - ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა - -Greetings in various languages: - - Hello world, Καλημέρα κόσμε, コンニチハ - -Box drawing alignment tests: █ - ▉ - ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳ - ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳ - ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳ - ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳ - ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎ - ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏ - ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ ▗▄▖▛▀▜ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█ - ▝▀▘▙▄▟ diff --git a/utf8parse/tests/utf-8-demo.rs b/utf8parse/tests/utf-8-demo.rs deleted file mode 100644 index 51df492..0000000 --- a/utf8parse/tests/utf-8-demo.rs +++ /dev/null @@ -1,31 +0,0 @@ -use utf8parse::{Parser, Receiver}; - -static UTF8_DEMO: &[u8] = include_bytes!("UTF-8-demo.txt"); - -#[derive(Debug, PartialEq)] -struct StringWrapper(String); - -impl Receiver for StringWrapper { - fn codepoint(&mut self, c: char) { - self.0.push(c); - } - - fn invalid_sequence(&mut self) {} -} - -#[test] -fn utf8parse_test() { - let mut parser = Parser::new(); - - // utf8parse implementation - let mut actual = StringWrapper(String::new()); - - for byte in UTF8_DEMO { - parser.advance(&mut actual, *byte) - } - - // standard library implementation - let expected = String::from_utf8_lossy(UTF8_DEMO); - - assert_eq!(actual.0, expected); -} |