diff --git a/library/core/src/ascii.rs b/library/core/src/ascii.rs index 8a4cb78cc7f..065f1b3e70e 100644 --- a/library/core/src/ascii.rs +++ b/library/core/src/ascii.rs @@ -9,10 +9,10 @@ #![stable(feature = "core_ascii", since = "1.26.0")] +use crate::escape; use crate::fmt; use crate::iter::FusedIterator; -use crate::ops::Range; -use crate::str::from_utf8_unchecked; +use crate::num::NonZeroUsize; /// An iterator over the escaped version of a byte. /// @@ -21,10 +21,7 @@ use crate::str::from_utf8_unchecked; #[must_use = "iterators are lazy and do nothing unless consumed"] #[stable(feature = "rust1", since = "1.0.0")] #[derive(Clone)] -pub struct EscapeDefault { - range: Range, - data: [u8; 4], -} +pub struct EscapeDefault(escape::EscapeIterInner<4>); /// Returns an iterator that produces an escaped version of a `u8`. /// @@ -90,21 +87,9 @@ pub struct EscapeDefault { /// ``` #[stable(feature = "rust1", since = "1.0.0")] pub fn escape_default(c: u8) -> EscapeDefault { - let (data, len) = match c { - b'\t' => ([b'\\', b't', 0, 0], 2), - b'\r' => ([b'\\', b'r', 0, 0], 2), - b'\n' => ([b'\\', b'n', 0, 0], 2), - b'\\' => ([b'\\', b'\\', 0, 0], 2), - b'\'' => ([b'\\', b'\'', 0, 0], 2), - b'"' => ([b'\\', b'"', 0, 0], 2), - b'\x20'..=b'\x7e' => ([c, 0, 0, 0], 1), - _ => { - let hex_digits: &[u8; 16] = b"0123456789abcdef"; - ([b'\\', b'x', hex_digits[(c >> 4) as usize], hex_digits[(c & 0xf) as usize]], 4) - } - }; - - return EscapeDefault { range: 0..len, data }; + let mut data = [0; 4]; + let range = escape::escape_ascii_into(&mut data, c); + EscapeDefault(escape::EscapeIterInner::new(data, range)) } #[stable(feature = "rust1", since = "1.0.0")] @@ -113,33 +98,59 @@ impl Iterator for EscapeDefault { #[inline] fn next(&mut self) -> Option { - self.range.next().map(|i| self.data[i as usize]) + self.0.next() } + + #[inline] fn size_hint(&self) -> (usize, Option) { - self.range.size_hint() + let n = self.0.len(); + (n, Some(n)) } + + #[inline] + fn count(self) -> usize { + self.0.len() + } + + #[inline] fn last(mut self) -> Option { - self.next_back() + self.0.next_back() + } + + #[inline] + fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.0.advance_by(n) } } + #[stable(feature = "rust1", since = "1.0.0")] impl DoubleEndedIterator for EscapeDefault { + #[inline] fn next_back(&mut self) -> Option { - self.range.next_back().map(|i| self.data[i as usize]) + self.0.next_back() + } + + #[inline] + fn advance_back_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.0.advance_back_by(n) } } + #[stable(feature = "rust1", since = "1.0.0")] -impl ExactSizeIterator for EscapeDefault {} +impl ExactSizeIterator for EscapeDefault { + #[inline] + fn len(&self) -> usize { + self.0.len() + } +} + #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for EscapeDefault {} #[stable(feature = "ascii_escape_display", since = "1.39.0")] impl fmt::Display for EscapeDefault { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // SAFETY: ok because `escape_default` created only valid utf-8 data - f.write_str(unsafe { - from_utf8_unchecked(&self.data[(self.range.start as usize)..(self.range.end as usize)]) - }) + f.write_str(self.0.as_str()) } } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 9bc97ea0bff..2408f178075 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -380,20 +380,7 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn escape_unicode(self) -> EscapeUnicode { - let c = self as u32; - - // or-ing 1 ensures that for c==0 the code computes that one - // digit should be printed and (which is the same) avoids the - // (31 - 32) underflow - let msb = 31 - (c | 1).leading_zeros(); - - // the index of the most significant hex digit - let ms_hex_digit = msb / 4; - EscapeUnicode { - c: self, - state: EscapeUnicodeState::Backslash, - hex_digit_idx: ms_hex_digit as usize, - } + EscapeUnicode::new(self) } /// An extended version of `escape_debug` that optionally permits escaping @@ -403,21 +390,20 @@ impl char { /// characters, and double quotes in strings. #[inline] pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { - let init_state = match self { - '\0' => EscapeDefaultState::Backslash('0'), - '\t' => EscapeDefaultState::Backslash('t'), - '\r' => EscapeDefaultState::Backslash('r'), - '\n' => EscapeDefaultState::Backslash('n'), - '\\' => EscapeDefaultState::Backslash(self), - '"' if args.escape_double_quote => EscapeDefaultState::Backslash(self), - '\'' if args.escape_single_quote => EscapeDefaultState::Backslash(self), + match self { + '\0' => EscapeDebug::backslash(b'0'), + '\t' => EscapeDebug::backslash(b't'), + '\r' => EscapeDebug::backslash(b'r'), + '\n' => EscapeDebug::backslash(b'n'), + '\\' => EscapeDebug::backslash(b'\\'), + '"' if args.escape_double_quote => EscapeDebug::backslash(b'"'), + '\'' if args.escape_single_quote => EscapeDebug::backslash(b'\''), _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { - EscapeDefaultState::Unicode(self.escape_unicode()) + EscapeDebug::from_unicode(self.escape_unicode()) } - _ if is_printable(self) => EscapeDefaultState::Char(self), - _ => EscapeDefaultState::Unicode(self.escape_unicode()), - }; - EscapeDebug(EscapeDefault { state: init_state }) + _ if is_printable(self) => EscapeDebug::printable(self), + _ => EscapeDebug::from_unicode(self.escape_unicode()), + } } /// Returns an iterator that yields the literal escape code of a character @@ -515,15 +501,14 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn escape_default(self) -> EscapeDefault { - let init_state = match self { - '\t' => EscapeDefaultState::Backslash('t'), - '\r' => EscapeDefaultState::Backslash('r'), - '\n' => EscapeDefaultState::Backslash('n'), - '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), - '\x20'..='\x7e' => EscapeDefaultState::Char(self), - _ => EscapeDefaultState::Unicode(self.escape_unicode()), - }; - EscapeDefault { state: init_state } + match self { + '\t' => EscapeDefault::backslash(b't'), + '\r' => EscapeDefault::backslash(b'r'), + '\n' => EscapeDefault::backslash(b'n'), + '\\' | '\'' | '"' => EscapeDefault::backslash(self as u8), + '\x20'..='\x7e' => EscapeDefault::printable(self as u8), + _ => EscapeDefault::from_unicode(self.escape_unicode()), + } } /// Returns the number of bytes this `char` would need if encoded in UTF-8. diff --git a/library/core/src/char/mod.rs b/library/core/src/char/mod.rs index 8ec78e88733..e186db7052c 100644 --- a/library/core/src/char/mod.rs +++ b/library/core/src/char/mod.rs @@ -39,8 +39,10 @@ pub use self::methods::encode_utf16_raw; pub use self::methods::encode_utf8_raw; use crate::error::Error; +use crate::escape; use crate::fmt::{self, Write}; use crate::iter::FusedIterator; +use crate::num::NonZeroUsize; pub(crate) use self::methods::EscapeDebugExtArgs; @@ -146,86 +148,44 @@ pub const fn from_digit(num: u32, radix: u32) -> Option { /// [`escape_unicode`]: char::escape_unicode #[derive(Clone, Debug)] #[stable(feature = "rust1", since = "1.0.0")] -pub struct EscapeUnicode { - c: char, - state: EscapeUnicodeState, +pub struct EscapeUnicode(escape::EscapeIterInner<10>); - // The index of the next hex digit to be printed (0 if none), - // i.e., the number of remaining hex digits to be printed; - // increasing from the least significant digit: 0x543210 - hex_digit_idx: usize, -} - -// The enum values are ordered so that their representation is the -// same as the remaining length (besides the hexadecimal digits). This -// likely makes `len()` a single load from memory) and inline-worth. -#[derive(Clone, Debug)] -enum EscapeUnicodeState { - Done, - RightBrace, - Value, - LeftBrace, - Type, - Backslash, +impl EscapeUnicode { + fn new(chr: char) -> Self { + let mut data = [0; 10]; + let range = escape::escape_unicode_into(&mut data, chr); + Self(escape::EscapeIterInner::new(data, range)) + } } #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for EscapeUnicode { type Item = char; + #[inline] fn next(&mut self) -> Option { - match self.state { - EscapeUnicodeState::Backslash => { - self.state = EscapeUnicodeState::Type; - Some('\\') - } - EscapeUnicodeState::Type => { - self.state = EscapeUnicodeState::LeftBrace; - Some('u') - } - EscapeUnicodeState::LeftBrace => { - self.state = EscapeUnicodeState::Value; - Some('{') - } - EscapeUnicodeState::Value => { - let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf; - let c = char::from_digit(hex_digit, 16).unwrap(); - if self.hex_digit_idx == 0 { - self.state = EscapeUnicodeState::RightBrace; - } else { - self.hex_digit_idx -= 1; - } - Some(c) - } - EscapeUnicodeState::RightBrace => { - self.state = EscapeUnicodeState::Done; - Some('}') - } - EscapeUnicodeState::Done => None, - } + self.0.next().map(char::from) } #[inline] fn size_hint(&self) -> (usize, Option) { - let n = self.len(); + let n = self.0.len(); (n, Some(n)) } #[inline] fn count(self) -> usize { - self.len() + self.0.len() } - fn last(self) -> Option { - match self.state { - EscapeUnicodeState::Done => None, + #[inline] + fn last(mut self) -> Option { + self.0.next_back().map(char::from) + } - EscapeUnicodeState::RightBrace - | EscapeUnicodeState::Value - | EscapeUnicodeState::LeftBrace - | EscapeUnicodeState::Type - | EscapeUnicodeState::Backslash => Some('}'), - } + #[inline] + fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.0.advance_by(n) } } @@ -233,16 +193,7 @@ impl Iterator for EscapeUnicode { impl ExactSizeIterator for EscapeUnicode { #[inline] fn len(&self) -> usize { - // The match is a single memory access with no branching - self.hex_digit_idx - + match self.state { - EscapeUnicodeState::Done => 0, - EscapeUnicodeState::RightBrace => 1, - EscapeUnicodeState::Value => 2, - EscapeUnicodeState::LeftBrace => 3, - EscapeUnicodeState::Type => 4, - EscapeUnicodeState::Backslash => 5, - } + self.0.len() } } @@ -252,10 +203,7 @@ impl FusedIterator for EscapeUnicode {} #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for EscapeUnicode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for c in self.clone() { - f.write_char(c)?; - } - Ok(()) + f.write_str(self.0.as_str()) } } @@ -267,90 +215,60 @@ impl fmt::Display for EscapeUnicode { /// [`escape_default`]: char::escape_default #[derive(Clone, Debug)] #[stable(feature = "rust1", since = "1.0.0")] -pub struct EscapeDefault { - state: EscapeDefaultState, -} +pub struct EscapeDefault(escape::EscapeIterInner<10>); -#[derive(Clone, Debug)] -enum EscapeDefaultState { - Done, - Char(char), - Backslash(char), - Unicode(EscapeUnicode), +impl EscapeDefault { + fn printable(chr: u8) -> Self { + let data = [chr, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + Self(escape::EscapeIterInner::new(data, 0..1)) + } + + fn backslash(chr: u8) -> Self { + let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0]; + Self(escape::EscapeIterInner::new(data, 0..2)) + } + + fn from_unicode(esc: EscapeUnicode) -> Self { + Self(esc.0) + } } #[stable(feature = "rust1", since = "1.0.0")] impl Iterator for EscapeDefault { type Item = char; + #[inline] fn next(&mut self) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - } - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; - Some(c) - } - EscapeDefaultState::Done => None, - EscapeDefaultState::Unicode(ref mut iter) => iter.next(), - } + self.0.next().map(char::from) } #[inline] fn size_hint(&self) -> (usize, Option) { - let n = self.len(); + let n = self.0.len(); (n, Some(n)) } #[inline] fn count(self) -> usize { - self.len() + self.0.len() } - fn nth(&mut self, n: usize) -> Option { - match self.state { - EscapeDefaultState::Backslash(c) if n == 0 => { - self.state = EscapeDefaultState::Char(c); - Some('\\') - } - EscapeDefaultState::Backslash(c) if n == 1 => { - self.state = EscapeDefaultState::Done; - Some(c) - } - EscapeDefaultState::Backslash(_) => { - self.state = EscapeDefaultState::Done; - None - } - EscapeDefaultState::Char(c) => { - self.state = EscapeDefaultState::Done; - - if n == 0 { Some(c) } else { None } - } - EscapeDefaultState::Done => None, - EscapeDefaultState::Unicode(ref mut i) => i.nth(n), - } + #[inline] + fn last(mut self) -> Option { + self.0.next_back().map(char::from) } - fn last(self) -> Option { - match self.state { - EscapeDefaultState::Unicode(iter) => iter.last(), - EscapeDefaultState::Done => None, - EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), - } + #[inline] + fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.0.advance_by(n) } } #[stable(feature = "exact_size_escape", since = "1.11.0")] impl ExactSizeIterator for EscapeDefault { + #[inline] fn len(&self) -> usize { - match self.state { - EscapeDefaultState::Done => 0, - EscapeDefaultState::Char(_) => 1, - EscapeDefaultState::Backslash(_) => 2, - EscapeDefaultState::Unicode(ref iter) => iter.len(), - } + self.0.len() } } @@ -360,10 +278,7 @@ impl FusedIterator for EscapeDefault {} #[stable(feature = "char_struct_display", since = "1.16.0")] impl fmt::Display for EscapeDefault { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for c in self.clone() { - f.write_char(c)?; - } - Ok(()) + f.write_str(self.0.as_str()) } } @@ -375,21 +290,74 @@ impl fmt::Display for EscapeDefault { /// [`escape_debug`]: char::escape_debug #[stable(feature = "char_escape_debug", since = "1.20.0")] #[derive(Clone, Debug)] -pub struct EscapeDebug(EscapeDefault); +pub struct EscapeDebug(EscapeDebugInner); -#[stable(feature = "char_escape_debug", since = "1.20.0")] -impl Iterator for EscapeDebug { - type Item = char; - fn next(&mut self) -> Option { - self.0.next() +#[derive(Clone, Debug)] +// Note: It’s possible to manually encode the EscapeDebugInner inside of +// EscapeIterInner (e.g. with alive=254..255 indicating that data[0..4] holds +// a char) which would likely result in a more optimised code. For now we use +// the option easier to implement. +enum EscapeDebugInner { + Bytes(escape::EscapeIterInner<10>), + Char(char), +} + +impl EscapeDebug { + fn printable(chr: char) -> Self { + Self(EscapeDebugInner::Char(chr)) } - fn size_hint(&self) -> (usize, Option) { - self.0.size_hint() + + fn backslash(chr: u8) -> Self { + let data = [b'\\', chr, 0, 0, 0, 0, 0, 0, 0, 0]; + let iter = escape::EscapeIterInner::new(data, 0..2); + Self(EscapeDebugInner::Bytes(iter)) + } + + fn from_unicode(esc: EscapeUnicode) -> Self { + Self(EscapeDebugInner::Bytes(esc.0)) + } + + fn clear(&mut self) { + let bytes = escape::EscapeIterInner::new([0; 10], 0..0); + self.0 = EscapeDebugInner::Bytes(bytes); } } #[stable(feature = "char_escape_debug", since = "1.20.0")] -impl ExactSizeIterator for EscapeDebug {} +impl Iterator for EscapeDebug { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + match self.0 { + EscapeDebugInner::Bytes(ref mut bytes) => bytes.next().map(char::from), + EscapeDebugInner::Char(chr) => { + self.clear(); + Some(chr) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let n = self.len(); + (n, Some(n)) + } + + #[inline] + fn count(self) -> usize { + self.len() + } +} + +#[stable(feature = "char_escape_debug", since = "1.20.0")] +impl ExactSizeIterator for EscapeDebug { + fn len(&self) -> usize { + match &self.0 { + EscapeDebugInner::Bytes(bytes) => bytes.len(), + EscapeDebugInner::Char(_) => 1, + } + } +} #[stable(feature = "fused", since = "1.26.0")] impl FusedIterator for EscapeDebug {} @@ -397,7 +365,10 @@ impl FusedIterator for EscapeDebug {} #[stable(feature = "char_escape_debug", since = "1.20.0")] impl fmt::Display for EscapeDebug { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&self.0, f) + match &self.0 { + EscapeDebugInner::Bytes(bytes) => f.write_str(bytes.as_str()), + EscapeDebugInner::Char(chr) => f.write_char(*chr), + } } } diff --git a/library/core/src/escape.rs b/library/core/src/escape.rs new file mode 100644 index 00000000000..20ac3cf027f --- /dev/null +++ b/library/core/src/escape.rs @@ -0,0 +1,99 @@ +//! Helper code for character escaping. + +use crate::num::NonZeroUsize; +use crate::ops::Range; + +const HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; + +/// Escapes a byte into provided buffer; returns length of escaped +/// representation. +pub(crate) fn escape_ascii_into(output: &mut [u8; 4], byte: u8) -> Range { + let (data, len) = match byte { + b'\t' => ([b'\\', b't', 0, 0], 2), + b'\r' => ([b'\\', b'r', 0, 0], 2), + b'\n' => ([b'\\', b'n', 0, 0], 2), + b'\\' => ([b'\\', b'\\', 0, 0], 2), + b'\'' => ([b'\\', b'\'', 0, 0], 2), + b'"' => ([b'\\', b'"', 0, 0], 2), + b'\x20'..=b'\x7e' => ([byte, 0, 0, 0], 1), + _ => { + let hi = HEX_DIGITS[usize::from(byte >> 4)]; + let lo = HEX_DIGITS[usize::from(byte & 0xf)]; + ([b'\\', b'x', hi, lo], 4) + } + }; + *output = data; + 0..(len as u8) +} + +/// Escapes a character into provided buffer using `\u{NNNN}` representation. +pub(crate) fn escape_unicode_into(output: &mut [u8; 10], ch: char) -> Range { + output[9] = b'}'; + + let ch = ch as u32; + output[3] = HEX_DIGITS[((ch >> 20) & 15) as usize]; + output[4] = HEX_DIGITS[((ch >> 16) & 15) as usize]; + output[5] = HEX_DIGITS[((ch >> 12) & 15) as usize]; + output[6] = HEX_DIGITS[((ch >> 8) & 15) as usize]; + output[7] = HEX_DIGITS[((ch >> 4) & 15) as usize]; + output[8] = HEX_DIGITS[((ch >> 0) & 15) as usize]; + + // or-ing 1 ensures that for ch==0 the code computes that one digit should + // be printed. + let start = (ch | 1).leading_zeros() as usize / 4 - 2; + output[start..start + 3].copy_from_slice(b"\\u{"); + + (start as u8)..10 +} + +/// An iterator over an fixed-size array. +/// +/// This is essentially equivalent to array’s IntoIter except that indexes are +/// limited to u8 to reduce size of the structure. +#[derive(Clone, Debug)] +pub(crate) struct EscapeIterInner { + // Invariant: data[alive] is all ASCII. + pub(crate) data: [u8; N], + + // Invariant: alive.start <= alive.end <= N. + pub(crate) alive: Range, +} + +impl EscapeIterInner { + pub fn new(data: [u8; N], alive: Range) -> Self { + const { assert!(N < 256) }; + debug_assert!(alive.start <= alive.end && usize::from(alive.end) <= N, "{alive:?}"); + let this = Self { data, alive }; + debug_assert!(this.as_bytes().is_ascii(), "Expected ASCII, got {:?}", this.as_bytes()); + this + } + + fn as_bytes(&self) -> &[u8] { + &self.data[usize::from(self.alive.start)..usize::from(self.alive.end)] + } + + pub fn as_str(&self) -> &str { + // SAFETY: self.data[self.alive] is all ASCII characters. + unsafe { crate::str::from_utf8_unchecked(self.as_bytes()) } + } + + pub fn len(&self) -> usize { + usize::from(self.alive.end - self.alive.start) + } + + pub fn next(&mut self) -> Option { + self.alive.next().map(|i| self.data[usize::from(i)]) + } + + pub fn next_back(&mut self) -> Option { + self.alive.next_back().map(|i| self.data[usize::from(i)]) + } + + pub fn advance_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.alive.advance_by(n) + } + + pub fn advance_back_by(&mut self, n: usize) -> Result<(), NonZeroUsize> { + self.alive.advance_back_by(n) + } +} diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 24a9d81d037..0aa8ea3b6f1 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -376,6 +376,7 @@ pub mod alloc; // note: does not need to be public mod bool; +mod escape; mod tuple; mod unit;