Auto merge of #76170 - matklad:notrivia, r=petrochenkov

Remove trivia tokens

r? @ghost
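This removes the `Whitespace`, `Comment`, `Shebang`, and `Unknown` variants from `TokenKind`. The lexer now consumes such trivia itself and instead tells the token-tree reader whether the token it returns was preceded by trivia, reusing the existing `IsJoint` type. A minimal sketch of that pattern, with simplified stand-in types rather than the compiler's real ones:

```rust
// A minimal sketch (hypothetical stand-in types, not the compiler's) of the
// pattern adopted here: the lexer consumes trivia itself and only reports
// whether the returned token was glued to the one before it.

#[derive(Debug, Clone, Copy, PartialEq)]
enum IsJoint {
    Joint,    // nothing between this token and the previous one
    NonJoint, // whitespace or a comment intervened
}

#[derive(Debug, PartialEq)]
enum RawToken {
    Whitespace,
    Comment,
    Op(char),
}

fn next_token(raw: &mut impl Iterator<Item = RawToken>) -> (IsJoint, Option<RawToken>) {
    let mut is_joint = IsJoint::Joint;
    loop {
        match raw.next() {
            // Trivia never reaches the caller; it only downgrades the
            // jointness of the next real token.
            Some(RawToken::Whitespace) | Some(RawToken::Comment) => {
                is_joint = IsJoint::NonJoint
            }
            other => return (is_joint, other),
        }
    }
}

fn main() {
    // `+ =` lexes as two tokens with the `=` marked NonJoint; in `+=`
    // the `=` would be Joint and could later be glued into one token.
    let mut toks = vec![RawToken::Op('+'), RawToken::Whitespace, RawToken::Op('=')].into_iter();
    assert_eq!(next_token(&mut toks), (IsJoint::Joint, Some(RawToken::Op('+'))));
    assert_eq!(next_token(&mut toks), (IsJoint::NonJoint, Some(RawToken::Op('='))));
}
```

The payoff is that every downstream consumer (pretty printer, proc-macro bridge, token comparison) can drop its trivia handling, which is what most of the deletions below amount to.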
Committed by bors on 2020-09-02 03:19:38 +00:00 in commit b4acb11033.
7 changed files with 72 additions and 112 deletions.

View File

@@ -251,17 +251,6 @@ pub enum TokenKind {
     /// similarly to symbols in string literal tokens.
     DocComment(CommentKind, ast::AttrStyle, Symbol),
 
-    // Junk. These carry no data because we don't really care about the data
-    // they *would* carry, and don't really want to allocate a new ident for
-    // them. Instead, users could extract that from the associated span.
-    /// Whitespace.
-    Whitespace,
-    /// A comment.
-    Comment,
-    Shebang(Symbol),
-    /// A completely invalid token which should be skipped.
-    Unknown(Symbol),
-
     Eof,
 }
@@ -331,7 +320,7 @@ impl Token {
     /// Some token that will be thrown away later.
     pub fn dummy() -> Self {
-        Token::new(TokenKind::Whitespace, DUMMY_SP)
+        Token::new(TokenKind::Question, DUMMY_SP)
     }
 
     /// Recovers a `Token` from an `Ident`. This creates a raw identifier if necessary.
@@ -360,7 +349,7 @@ impl Token {
     pub fn is_op(&self) -> bool {
         match self.kind {
             OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
-            | Lifetime(..) | Interpolated(..) | Whitespace | Comment | Shebang(..) | Eof => false,
+            | Lifetime(..) | Interpolated(..) | Eof => false,
             _ => true,
         }
     }
@@ -676,8 +665,7 @@ impl Token {
             Le | EqEq | Ne | Ge | AndAnd | OrOr | Tilde | BinOpEq(..) | At | DotDotDot
             | DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar
             | Question | OpenDelim(..) | CloseDelim(..) | Literal(..) | Ident(..)
-            | Lifetime(..) | Interpolated(..) | DocComment(..) | Whitespace | Comment
-            | Shebang(..) | Unknown(..) | Eof => return None,
+            | Lifetime(..) | Interpolated(..) | DocComment(..) | Eof => return None,
         };
 
         Some(Token::new(kind, self.span.to(joint.span)))
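The last hunk above is the fall-through arm of token gluing (the `joint` parameter and the span join suggest this is `Token::glue`): the kinds it lists never combine with a following token, and with the trivia variants gone the list shrinks. A self-contained sketch of the gluing idea, with a hypothetical, much-reduced token enum:

```rust
// Simplified sketch of token gluing (hypothetical enum, not rustc's):
// two adjacent Joint punctuation tokens may combine into a compound token.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Kind {
    Lt,   // `<`
    Eq,   // `=`
    Le,   // `<=`
    EqEq, // `==`
}

fn glue(first: Kind, second: Kind) -> Option<Kind> {
    match (first, second) {
        (Kind::Lt, Kind::Eq) => Some(Kind::Le),
        (Kind::Eq, Kind::Eq) => Some(Kind::EqEq),
        // Already-compound tokens, literals, identifiers, doc comments,
        // Eof, etc. never glue; that is the big `=> return None` arm above.
        _ => None,
    }
}

fn main() {
    assert_eq!(glue(Kind::Lt, Kind::Eq), Some(Kind::Le));
    assert_eq!(glue(Kind::Le, Kind::Eq), None);
}
```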

View File

@@ -289,10 +289,6 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option<Span>)
             doc_comment_to_string(comment_kind, attr_style, data)
         }
         token::Eof => "<eof>".to_string(),
-        token::Whitespace => " ".to_string(),
-        token::Comment => "/* */".to_string(),
-        token::Shebang(s) => format!("/* shebang: {}*/", s),
-        token::Unknown(s) => s.to_string(),
 
         token::Interpolated(ref nt) => nonterminal_to_string(nt),
     }

View File

@@ -189,7 +189,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec<Self>)>
             }
 
             OpenDelim(..) | CloseDelim(..) => unreachable!(),
-            Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(),
+            Eof => unreachable!(),
         }
     }
 }

View File

@@ -1,5 +1,6 @@
 use rustc_ast::ast::AttrStyle;
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
+use rustc_ast::tokenstream::IsJoint;
 use rustc_data_structures::sync::Lrc;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
 use rustc_lexer::Base;
@@ -65,42 +66,46 @@ impl<'a> StringReader<'a> {
         self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
     }
 
-    /// Returns the next token, including trivia like whitespace or comments.
-    fn next_token(&mut self) -> Token {
-        let start_src_index = self.src_index(self.pos);
-        let text: &str = &self.src[start_src_index..self.end_src_index];
-
-        if text.is_empty() {
-            let span = self.mk_sp(self.pos, self.pos);
-            return Token::new(token::Eof, span);
-        }
-
-        {
-            let is_beginning_of_file = self.pos == self.start_pos;
-            if is_beginning_of_file {
-                if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
-                    let start = self.pos;
-                    self.pos = self.pos + BytePos::from_usize(shebang_len);
-
-                    let sym = self.symbol_from(start + BytePos::from_usize("#!".len()));
-                    let kind = token::Shebang(sym);
-
-                    let span = self.mk_sp(start, self.pos);
-                    return Token::new(kind, span);
-                }
-            }
-        }
-
-        let token = rustc_lexer::first_token(text);
-
-        let start = self.pos;
-        self.pos = self.pos + BytePos::from_usize(token.len);
-
-        debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
-
-        let kind = self.cook_lexer_token(token.kind, start);
-        let span = self.mk_sp(start, self.pos);
-        Token::new(kind, span)
+    /// Returns the next token, and info about preceding whitespace, if any.
+    fn next_token(&mut self) -> (IsJoint, Token) {
+        let mut is_joint = IsJoint::Joint;
+
+        // Skip `#!` at the start of the file
+        let start_src_index = self.src_index(self.pos);
+        let text: &str = &self.src[start_src_index..self.end_src_index];
+        let is_beginning_of_file = self.pos == self.start_pos;
+        if is_beginning_of_file {
+            if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
+                self.pos = self.pos + BytePos::from_usize(shebang_len);
+                is_joint = IsJoint::NonJoint;
+            }
+        }
+
+        // Skip trivial (whitespace & comments) tokens
+        loop {
+            let start_src_index = self.src_index(self.pos);
+            let text: &str = &self.src[start_src_index..self.end_src_index];
+
+            if text.is_empty() {
+                let span = self.mk_sp(self.pos, self.pos);
+                return (is_joint, Token::new(token::Eof, span));
+            }
+
+            let token = rustc_lexer::first_token(text);
+            let start = self.pos;
+            self.pos = self.pos + BytePos::from_usize(token.len);
+
+            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
+
+            match self.cook_lexer_token(token.kind, start) {
+                Some(kind) => {
+                    let span = self.mk_sp(start, self.pos);
+                    return (is_joint, Token::new(kind, span));
+                }
+                None => is_joint = IsJoint::NonJoint,
+            }
+        }
     }
 
     /// Report a fatal lexical error with a given span.
@@ -140,19 +145,16 @@ impl<'a> StringReader<'a> {
     /// Turns simple `rustc_lexer::TokenKind` enum into a rich
     /// `librustc_ast::TokenKind`. This turns strings into interned
     /// symbols and runs additional validation.
-    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
-        match token {
+    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
+        Some(match token {
             rustc_lexer::TokenKind::LineComment { doc_style } => {
-                match doc_style {
-                    Some(doc_style) => {
-                        // Opening delimiter of the length 3 is not included into the symbol.
-                        let content_start = start + BytePos(3);
-                        let content = self.str_from(content_start);
-                        self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
-                    }
-                    None => token::Comment,
-                }
+                // Skip non-doc comments
+                let doc_style = doc_style?;
+
+                // Opening delimiter of the length 3 is not included into the symbol.
+                let content_start = start + BytePos(3);
+                let content = self.str_from(content_start);
+                self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
             }
             rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                 if !terminated {
@@ -171,20 +173,18 @@ impl<'a> StringReader<'a> {
                         .emit();
                     FatalError.raise();
                 }
-                match doc_style {
-                    Some(doc_style) => {
-                        // Opening delimiter of the length 3 and closing delimiter of the length 2
-                        // are not included into the symbol.
-                        let content_start = start + BytePos(3);
-                        let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
-                        let content = self.str_from_to(content_start, content_end);
-                        self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
-                    }
-                    None => token::Comment,
-                }
+
+                // Skip non-doc comments
+                let doc_style = doc_style?;
+
+                // Opening delimiter of the length 3 and closing delimiter of the length 2
+                // are not included into the symbol.
+                let content_start = start + BytePos(3);
+                let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                let content = self.str_from_to(content_start, content_end);
+                self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
             }
-            rustc_lexer::TokenKind::Whitespace => token::Whitespace,
+            rustc_lexer::TokenKind::Whitespace => return None,
             rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => {
                 let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
                 let mut ident_start = start;
@@ -282,12 +282,11 @@ impl<'a> StringReader<'a> {
                 // this should be inside `rustc_lexer`. However, we should first remove compound
                 // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
                 // as there will be less overall work to do this way.
-                let token = unicode_chars::check_for_substitution(self, start, c, &mut err)
-                    .unwrap_or_else(|| token::Unknown(self.symbol_from(start)));
+                let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
                 err.emit();
-                token
+                token?
             }
-        }
+        })
     }
 
     fn cook_doc_comment(
@@ -450,12 +449,6 @@ impl<'a> StringReader<'a> {
         self.str_from_to(start, self.pos)
     }
 
-    /// Creates a Symbol from a given offset to the current offset.
-    fn symbol_from(&self, start: BytePos) -> Symbol {
-        debug!("taking an ident from {:?} to {:?}", start, self.pos);
-        Symbol::intern(self.str_from(start))
-    }
-
    /// As symbol_from, with an explicit endpoint.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        debug!("taking an ident from {:?} to {:?}", start, end);

View File

@@ -16,7 +16,6 @@ impl<'a> StringReader<'a> {
         let mut tt_reader = TokenTreesReader {
             string_reader: self,
             token: Token::dummy(),
-            joint_to_prev: Joint,
             open_braces: Vec::new(),
             unmatched_braces: Vec::new(),
             matching_delim_spans: Vec::new(),
@@ -32,7 +31,6 @@ impl<'a> StringReader<'a> {
 struct TokenTreesReader<'a> {
     string_reader: StringReader<'a>,
     token: Token,
-    joint_to_prev: IsJoint,
     /// Stack of open delimiters and their spans. Used for error message.
     open_braces: Vec<(token::DelimToken, Span)>,
     unmatched_braces: Vec<UnmatchedBrace>,
@@ -53,7 +51,7 @@ impl<'a> TokenTreesReader<'a> {
     fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
         let mut buf = TokenStreamBuilder::default();
 
-        self.real_token();
+        self.bump();
         while self.token != token::Eof {
             buf.push(self.parse_token_tree()?);
         }
@@ -126,7 +124,7 @@ impl<'a> TokenTreesReader<'a> {
                 // Parse the open delimiter.
                 self.open_braces.push((delim, self.token.span));
-                self.real_token();
+                self.bump();
 
                 // Parse the token trees within the delimiters.
                 // We stop at any delimiter so we can try to recover if the user
@@ -171,7 +169,7 @@ impl<'a> TokenTreesReader<'a> {
                         ));
                     }
                     // Parse the closing delimiter.
-                    self.real_token();
+                    self.bump();
                 }
                 // Incorrect delimiter.
                 token::CloseDelim(other) => {
@@ -217,7 +215,7 @@ impl<'a> TokenTreesReader<'a> {
                     // bar(baz(
                     // }  // Incorrect delimiter but matches the earlier `{`
                     if !self.open_braces.iter().any(|&(b, _)| b == other) {
-                        self.real_token();
+                        self.bump();
                     }
                 }
                 token::Eof => {
@@ -264,27 +262,19 @@ impl<'a> TokenTreesReader<'a> {
             }
             _ => {
                 let tt = TokenTree::Token(self.token.take());
-                self.real_token();
-                let is_joint = self.joint_to_prev == Joint && self.token.is_op();
-                Ok((tt, if is_joint { Joint } else { NonJoint }))
+                let mut is_joint = self.bump();
+                if !self.token.is_op() {
+                    is_joint = NonJoint;
+                }
+                Ok((tt, is_joint))
             }
         }
     }
 
-    fn real_token(&mut self) {
-        self.joint_to_prev = Joint;
-        loop {
-            let token = self.string_reader.next_token();
-            match token.kind {
-                token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => {
-                    self.joint_to_prev = NonJoint;
-                }
-                _ => {
-                    self.token = token;
-                    return;
-                }
-            }
-        }
+    fn bump(&mut self) -> IsJoint {
+        let (joint_to_prev, token) = self.string_reader.next_token();
+        self.token = token;
+        joint_to_prev
     }
 }
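Observable behaviour should be unchanged: jointness still reaches procedural macros as `Spacing`, only computed from the lexer's report rather than from skipped trivia tokens. A quick illustration written against the `proc_macro2` crate so it runs outside a macro context (the crate choice is an assumption; in-compiler, the path is the `FromInternal` impl touched earlier):

```rust
use proc_macro2::{TokenStream, TokenTree};

fn main() {
    // In `a += b`, `+` is Joint (glued to the following `=`),
    // while `=` is Alone; `a + = b` would mark both Alone.
    let stream: TokenStream = "a += b".parse().unwrap();
    for tt in stream {
        if let TokenTree::Punct(p) = tt {
            println!("{} -> {:?}", p.as_char(), p.spacing());
        }
    }
}
```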

View File

@@ -303,7 +303,7 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
 // However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
 // fancier error recovery to it, as there will be less overall work to do this way.
 const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
-    (' ', "Space", Some(token::Whitespace)),
+    (' ', "Space", None),
     ('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
     ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
     (',', "Comma", Some(token::Comma)),

View File

@@ -348,9 +348,6 @@ pub fn tokenstream_probably_equal_for_proc_macro(
             | token::CloseDelim(DelimToken::NoDelim)
             // The pretty printer collapses many semicolons into one.
             | token::Semi
-            // The pretty printer collapses whitespace arbitrarily and can
-            // introduce whitespace from `NoDelim`.
-            | token::Whitespace
             // The pretty printer can turn `$crate` into `::crate_name`
             | token::ModSep = token.kind {
             return false;
@@ -506,8 +503,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
         | (&Pound, &Pound)
         | (&Dollar, &Dollar)
         | (&Question, &Question)
-        | (&Whitespace, &Whitespace)
-        | (&Comment, &Comment)
         | (&Eof, &Eof) => true,
 
         (&BinOp(a), &BinOp(b)) | (&BinOpEq(a), &BinOpEq(b)) => a == b,
@@ -516,8 +511,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
         (&DocComment(a1, a2, a3), &DocComment(b1, b2, b3)) => a1 == b1 && a2 == b2 && a3 == b3,
 
-        (&Shebang(a), &Shebang(b)) => a == b,
-
         (&Literal(a), &Literal(b)) => a == b,
         (&Lifetime(a), &Lifetime(b)) => a == b,