From 8f24c2ec9d43d3c616652a4040b8c6a1672b18d8 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Mon, 31 Aug 2020 19:00:49 +0200 Subject: [PATCH 1/3] Don't emit trivia tokens --- compiler/rustc_parse/src/lexer/mod.rs | 115 +++++++++---------- compiler/rustc_parse/src/lexer/tokentrees.rs | 16 +-- 2 files changed, 57 insertions(+), 74 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 1131f00cb42..034442b798b 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,5 +1,6 @@ use rustc_ast::ast::AttrStyle; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; +use rustc_ast::tokenstream::IsJoint; use rustc_data_structures::sync::Lrc; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError}; use rustc_lexer::Base; @@ -65,42 +66,46 @@ impl<'a> StringReader<'a> { self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi)) } - /// Returns the next token, including trivia like whitespace or comments. - fn next_token(&mut self) -> Token { + /// Returns the next token, and info about preceding whitespace, if any. 
+ fn next_token(&mut self) -> (IsJoint, Token) { + let mut is_joint = IsJoint::Joint; + + // Skip `#!` at the start of the file let start_src_index = self.src_index(self.pos); let text: &str = &self.src[start_src_index..self.end_src_index]; - - if text.is_empty() { - let span = self.mk_sp(self.pos, self.pos); - return Token::new(token::Eof, span); - } - - { - let is_beginning_of_file = self.pos == self.start_pos; - if is_beginning_of_file { - if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { - let start = self.pos; - self.pos = self.pos + BytePos::from_usize(shebang_len); - - let sym = self.symbol_from(start + BytePos::from_usize("#!".len())); - let kind = token::Shebang(sym); - - let span = self.mk_sp(start, self.pos); - return Token::new(kind, span); - } + let is_beginning_of_file = self.pos == self.start_pos; + if is_beginning_of_file { + if let Some(shebang_len) = rustc_lexer::strip_shebang(text) { + self.pos = self.pos + BytePos::from_usize(shebang_len); + is_joint = IsJoint::NonJoint; } } - let token = rustc_lexer::first_token(text); + // Skip trivial (whitespace & comments) tokens + loop { + let start_src_index = self.src_index(self.pos); + let text: &str = &self.src[start_src_index..self.end_src_index]; - let start = self.pos; - self.pos = self.pos + BytePos::from_usize(token.len); + if text.is_empty() { + let span = self.mk_sp(self.pos, self.pos); + return (is_joint, Token::new(token::Eof, span)); + } - debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start)); + let token = rustc_lexer::first_token(text); - let kind = self.cook_lexer_token(token.kind, start); - let span = self.mk_sp(start, self.pos); - Token::new(kind, span) + let start = self.pos; + self.pos = self.pos + BytePos::from_usize(token.len); + + debug!("next_token: {:?}({:?})", token.kind, self.str_from(start)); + + match self.cook_lexer_token(token.kind, start) { + Some(kind) => { + let span = self.mk_sp(start, self.pos); + return (is_joint, Token::new(kind, span)); + 
} + None => is_joint = IsJoint::NonJoint, + } + } } /// Report a fatal lexical error with a given span. @@ -140,19 +145,16 @@ impl<'a> StringReader<'a> { /// Turns simple `rustc_lexer::TokenKind` enum into a rich /// `librustc_ast::TokenKind`. This turns strings into interned /// symbols and runs additional validation. - fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind { - match token { + fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> { + Some(match token { rustc_lexer::TokenKind::LineComment { doc_style } => { - match doc_style { - Some(doc_style) => { - // Opening delimiter of the length 3 is not included into the symbol. - let content_start = start + BytePos(3); - let content = self.str_from(content_start); + // Skip non-doc comments + let doc_style = doc_style?; - self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) - } - None => token::Comment, - } + // Opening delimiter of the length 3 is not included into the symbol. + let content_start = start + BytePos(3); + let content = self.str_from(content_start); + self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style) } rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => { if !terminated { @@ -171,20 +173,18 @@ impl<'a> StringReader<'a> { .emit(); FatalError.raise(); } - match doc_style { - Some(doc_style) => { - // Opening delimiter of the length 3 and closing delimiter of the length 2 - // are not included into the symbol. - let content_start = start + BytePos(3); - let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); - let content = self.str_from_to(content_start, content_end); - self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) - } - None => token::Comment, - } + // Skip non-doc comments + let doc_style = doc_style?; + + // Opening delimiter of the length 3 and closing delimiter of the length 2 + // are not included into the symbol. 
+ let content_start = start + BytePos(3); + let content_end = self.pos - BytePos(if terminated { 2 } else { 0 }); + let content = self.str_from_to(content_start, content_end); + self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style) } - rustc_lexer::TokenKind::Whitespace => token::Whitespace, + rustc_lexer::TokenKind::Whitespace => return None, rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => { let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent; let mut ident_start = start; @@ -282,12 +282,11 @@ impl<'a> StringReader<'a> { // this should be inside `rustc_lexer`. However, we should first remove compound // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it, // as there will be less overall work to do this way. - let token = unicode_chars::check_for_substitution(self, start, c, &mut err) - .unwrap_or_else(|| token::Unknown(self.symbol_from(start))); + let token = unicode_chars::check_for_substitution(self, start, c, &mut err); err.emit(); - token + token? } - } + }) } fn cook_doc_comment( @@ -450,12 +449,6 @@ impl<'a> StringReader<'a> { self.str_from_to(start, self.pos) } - /// Creates a Symbol from a given offset to the current offset. - fn symbol_from(&self, start: BytePos) -> Symbol { - debug!("taking an ident from {:?} to {:?}", start, self.pos); - Symbol::intern(self.str_from(start)) - } - /// As symbol_from, with an explicit endpoint. 
fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol { debug!("taking an ident from {:?} to {:?}", start, end); diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index c08659ec9f6..6e13bfb9c9d 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -272,19 +272,9 @@ impl<'a> TokenTreesReader<'a> { } fn real_token(&mut self) { - self.joint_to_prev = Joint; - loop { - let token = self.string_reader.next_token(); - match token.kind { - token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => { - self.joint_to_prev = NonJoint; - } - _ => { - self.token = token; - return; - } - } - } + let (joint_to_prev, token) = self.string_reader.next_token(); + self.joint_to_prev = joint_to_prev; + self.token = token; } } From 5326361fc00544f7ba77b250e9646cf05cd07c43 Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Tue, 1 Sep 2020 11:24:52 +0200 Subject: [PATCH 2/3] Remove trivia tokens --- compiler/rustc_ast/src/token.rs | 18 +++--------------- compiler/rustc_ast_pretty/src/pprust.rs | 4 ---- compiler/rustc_expand/src/proc_macro_server.rs | 2 +- .../rustc_parse/src/lexer/unicode_chars.rs | 2 +- compiler/rustc_parse/src/lib.rs | 7 ------- 5 files changed, 5 insertions(+), 28 deletions(-) diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs index 4a8bf6b4f19..c6cc890b47f 100644 --- a/compiler/rustc_ast/src/token.rs +++ b/compiler/rustc_ast/src/token.rs @@ -251,17 +251,6 @@ pub enum TokenKind { /// similarly to symbols in string literal tokens. DocComment(CommentKind, ast::AttrStyle, Symbol), - // Junk. These carry no data because we don't really care about the data - // they *would* carry, and don't really want to allocate a new ident for - // them. Instead, users could extract that from the associated span. - /// Whitespace. - Whitespace, - /// A comment. 
- Comment, - Shebang(Symbol), - /// A completely invalid token which should be skipped. - Unknown(Symbol), - Eof, } @@ -331,7 +320,7 @@ impl Token { /// Some token that will be thrown away later. pub fn dummy() -> Self { - Token::new(TokenKind::Whitespace, DUMMY_SP) + Token::new(TokenKind::Question, DUMMY_SP) } /// Recovers a `Token` from an `Ident`. This creates a raw identifier if necessary. @@ -360,7 +349,7 @@ impl Token { pub fn is_op(&self) -> bool { match self.kind { OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..) - | Lifetime(..) | Interpolated(..) | Whitespace | Comment | Shebang(..) | Eof => false, + | Lifetime(..) | Interpolated(..) | Eof => false, _ => true, } } @@ -676,8 +665,7 @@ impl Token { Le | EqEq | Ne | Ge | AndAnd | OrOr | Tilde | BinOpEq(..) | At | DotDotDot | DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar | Question | OpenDelim(..) | CloseDelim(..) | Literal(..) | Ident(..) - | Lifetime(..) | Interpolated(..) | DocComment(..) | Whitespace | Comment - | Shebang(..) | Unknown(..) | Eof => return None, + | Lifetime(..) | Interpolated(..) | DocComment(..) 
| Eof => return None, }; Some(Token::new(kind, self.span.to(joint.span))) diff --git a/compiler/rustc_ast_pretty/src/pprust.rs b/compiler/rustc_ast_pretty/src/pprust.rs index cb48deb5886..aa790878c52 100644 --- a/compiler/rustc_ast_pretty/src/pprust.rs +++ b/compiler/rustc_ast_pretty/src/pprust.rs @@ -289,10 +289,6 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option) doc_comment_to_string(comment_kind, attr_style, data) } token::Eof => "".to_string(), - token::Whitespace => " ".to_string(), - token::Comment => "/* */".to_string(), - token::Shebang(s) => format!("/* shebang: {}*/", s), - token::Unknown(s) => s.to_string(), token::Interpolated(ref nt) => nonterminal_to_string(nt), } diff --git a/compiler/rustc_expand/src/proc_macro_server.rs b/compiler/rustc_expand/src/proc_macro_server.rs index 409784812f5..39c82f97e0a 100644 --- a/compiler/rustc_expand/src/proc_macro_server.rs +++ b/compiler/rustc_expand/src/proc_macro_server.rs @@ -189,7 +189,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec)> } OpenDelim(..) | CloseDelim(..) => unreachable!(), - Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(), + Eof => unreachable!(), } } } diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index ac395f6cbc2..8dc0db01ecb 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -303,7 +303,7 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[ // However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add // fancier error recovery to it, as there will be less overall work to do this way. 
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[ - (' ', "Space", Some(token::Whitespace)), + (' ', "Space", None), ('_', "Underscore", Some(token::Ident(kw::Underscore, false))), ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))), (',', "Comma", Some(token::Comma)), diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs index bc857c97742..462279b0a9e 100644 --- a/compiler/rustc_parse/src/lib.rs +++ b/compiler/rustc_parse/src/lib.rs @@ -348,9 +348,6 @@ pub fn tokenstream_probably_equal_for_proc_macro( | token::CloseDelim(DelimToken::NoDelim) // The pretty printer collapses many semicolons into one. | token::Semi - // The pretty printer collapses whitespace arbitrarily and can - // introduce whitespace from `NoDelim`. - | token::Whitespace // The pretty printer can turn `$crate` into `::crate_name` | token::ModSep = token.kind { return false; @@ -506,8 +503,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool { | (&Pound, &Pound) | (&Dollar, &Dollar) | (&Question, &Question) - | (&Whitespace, &Whitespace) - | (&Comment, &Comment) | (&Eof, &Eof) => true, (&BinOp(a), &BinOp(b)) | (&BinOpEq(a), &BinOpEq(b)) => a == b, @@ -516,8 +511,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool { (&DocComment(a1, a2, a3), &DocComment(b1, b2, b3)) => a1 == b1 && a2 == b2 && a3 == b3, - (&Shebang(a), &Shebang(b)) => a == b, - (&Literal(a), &Literal(b)) => a == b, (&Lifetime(a), &Lifetime(b)) => a == b, From fabd8a68345270c053cce906e5f037e0cfe7b6ef Mon Sep 17 00:00:00 2001 From: Aleksey Kladov Date: Tue, 1 Sep 2020 11:36:04 +0200 Subject: [PATCH 3/3] Simplify TokenTreesReader This `joint_to_prev` bit of state is no longer needed. 
--- compiler/rustc_parse/src/lexer/tokentrees.rs | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 6e13bfb9c9d..d5977ca3c7d 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -16,7 +16,6 @@ impl<'a> StringReader<'a> { let mut tt_reader = TokenTreesReader { string_reader: self, token: Token::dummy(), - joint_to_prev: Joint, open_braces: Vec::new(), unmatched_braces: Vec::new(), matching_delim_spans: Vec::new(), @@ -32,7 +31,6 @@ impl<'a> StringReader<'a> { struct TokenTreesReader<'a> { string_reader: StringReader<'a>, token: Token, - joint_to_prev: IsJoint, /// Stack of open delimiters and their spans. Used for error message. open_braces: Vec<(token::DelimToken, Span)>, unmatched_braces: Vec<UnmatchedBrace>, @@ -53,7 +51,7 @@ impl<'a> TokenTreesReader<'a> { fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> { let mut buf = TokenStreamBuilder::default(); - self.real_token(); + self.bump(); while self.token != token::Eof { buf.push(self.parse_token_tree()?); } @@ -126,7 +124,7 @@ impl<'a> TokenTreesReader<'a> { // Parse the open delimiter. self.open_braces.push((delim, self.token.span)); - self.real_token(); + self.bump(); // Parse the token trees within the delimiters. // We stop at any delimiter so we can try to recover if the user @@ -171,7 +169,7 @@ impl<'a> TokenTreesReader<'a> { )); } // Parse the closing delimiter. - self.real_token(); + self.bump(); } // Incorrect delimiter. 
token::CloseDelim(other) => { @@ -217,7 +215,7 @@ impl<'a> TokenTreesReader<'a> { // bar(baz( // } // Incorrect delimiter but matches the earlier `{` if !self.open_braces.iter().any(|&(b, _)| b == other) { - self.real_token(); + self.bump(); } } token::Eof => { @@ -264,17 +262,19 @@ impl<'a> TokenTreesReader<'a> { } _ => { let tt = TokenTree::Token(self.token.take()); - self.real_token(); - let is_joint = self.joint_to_prev == Joint && self.token.is_op(); - Ok((tt, if is_joint { Joint } else { NonJoint })) + let mut is_joint = self.bump(); + if !self.token.is_op() { + is_joint = NonJoint; + } + Ok((tt, is_joint)) } } } - fn real_token(&mut self) { + fn bump(&mut self) -> IsJoint { let (joint_to_prev, token) = self.string_reader.next_token(); - self.joint_to_prev = joint_to_prev; self.token = token; + joint_to_prev } }