From 66e9b1149c7fbc1fb8108de72b9da1ec0f35afec Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Wed, 21 Sep 2022 14:01:39 +1000
Subject: [PATCH 01/15] Rearrange `TokenTreesReader::parse_token_tree`.

`parse_token_tree` is basically a match with four arms: `Eof`,
`OpenDelim`, `CloseDelim`, and "other". It has two call sites, and at
each call site one of the arms is unreachable. It's also not inlined.

This commit removes `parse_token_tree` by splitting it into four
functions and inlining them. This avoids some repeated conditional
tests and also some non-inlined function calls on the hot path.
---
 compiler/rustc_errors/src/lib.rs             |   3 +-
 compiler/rustc_parse/src/lexer/tokentrees.rs | 384 +++++++++----------
 2 files changed, 189 insertions(+), 198 deletions(-)

diff --git a/compiler/rustc_errors/src/lib.rs b/compiler/rustc_errors/src/lib.rs
index b44cf352233..babab1fa112 100644
--- a/compiler/rustc_errors/src/lib.rs
+++ b/compiler/rustc_errors/src/lib.rs
@@ -63,7 +63,8 @@ pub mod translation;
 pub use diagnostic_builder::IntoDiagnostic;
 pub use snippet::Style;
 
-pub type PResult<'a, T> = Result<T, DiagnosticBuilder<'a, ErrorGuaranteed>>;
+pub type PErr<'a> = DiagnosticBuilder<'a, ErrorGuaranteed>;
+pub type PResult<'a, T> = Result<T, PErr<'a>>;
 
 // `PResult` is used a lot. Make sure it doesn't unintentionally get bigger.
 // (See also the comment on `DiagnosticBuilder`'s `diagnostic` field.)
diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs
index aa70912dcde..3372544a579 100644
--- a/compiler/rustc_parse/src/lexer/tokentrees.rs
+++ b/compiler/rustc_parse/src/lexer/tokentrees.rs
@@ -4,7 +4,7 @@
 use rustc_ast::token::{self, Delimiter, Token};
 use rustc_ast::tokenstream::{DelimSpan, Spacing, TokenStream, TokenTree};
 use rustc_ast_pretty::pprust::token_to_string;
 use rustc_data_structures::fx::FxHashMap;
-use rustc_errors::PResult;
+use rustc_errors::{PErr, PResult};
 use rustc_span::Span;
 
 impl<'a> StringReader<'a> {
@@ -48,220 +48,210 @@ impl<'a> TokenTreesReader<'a> {
         let mut buf = TokenStreamBuilder::default();
 
         self.bump();
-        while self.token != token::Eof {
-            buf.push(self.parse_token_tree()?);
+        loop {
+            match self.token.kind {
+                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
+                token::CloseDelim(delim) => return Err(self.close_delim_err(delim)),
+                token::Eof => return Ok(buf.into_token_stream()),
+                _ => buf.push(self.parse_token_tree_other()),
+            }
         }
-
-        Ok(buf.into_token_stream())
     }
 
     // Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`.
     fn parse_token_trees_until_close_delim(&mut self) -> TokenStream {
         let mut buf = TokenStreamBuilder::default();
         loop {
-            if let token::CloseDelim(..) = self.token.kind {
-                return buf.into_token_stream();
-            }
-
-            match self.parse_token_tree() {
-                Ok(tree) => buf.push(tree),
-                Err(mut e) => {
-                    e.emit();
+            match self.token.kind {
+                token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
+                token::CloseDelim(..)
=> return buf.into_token_stream(), + token::Eof => { + let mut err = self.eof_err(); + err.emit(); return buf.into_token_stream(); } + _ => buf.push(self.parse_token_tree_other()), } } } - fn parse_token_tree(&mut self) -> PResult<'a, TokenTree> { - let sm = self.string_reader.sess.source_map(); + fn eof_err(&mut self) -> PErr<'a> { + let msg = "this file contains an unclosed delimiter"; + let mut err = self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg); + for &(_, sp) in &self.open_braces { + err.span_label(sp, "unclosed delimiter"); + self.unmatched_braces.push(UnmatchedBrace { + expected_delim: Delimiter::Brace, + found_delim: None, + found_span: self.token.span, + unclosed_span: Some(sp), + candidate_span: None, + }); + } - match self.token.kind { - token::Eof => { - let msg = "this file contains an unclosed delimiter"; - let mut err = - self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, msg); - for &(_, sp) in &self.open_braces { - err.span_label(sp, "unclosed delimiter"); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: Delimiter::Brace, - found_delim: None, - found_span: self.token.span, - unclosed_span: Some(sp), - candidate_span: None, - }); - } - - if let Some((delim, _)) = self.open_braces.last() { - if let Some((_, open_sp, close_sp)) = - self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| { - if let Some(close_padding) = sm.span_to_margin(*close_sp) { - if let Some(open_padding) = sm.span_to_margin(*open_sp) { - return delim == d && close_padding != open_padding; - } - } - false - }) - // these are in reverse order as they get inserted on close, but - { - // we want the last open/first close - err.span_label(*open_sp, "this delimiter might not be properly closed..."); - err.span_label( - *close_sp, - "...as it matches this but it has different indentation", - ); - } - } - Err(err) - } - token::OpenDelim(delim) => { - // The span for beginning of the delimited section - let pre_span = self.token.span; - - // Parse the open delimiter. - self.open_braces.push((delim, self.token.span)); - self.bump(); - - // Parse the token trees within the delimiters. - // We stop at any delimiter so we can try to recover if the user - // uses an incorrect delimiter. - let tts = self.parse_token_trees_until_close_delim(); - - // Expand to cover the entire delimited token tree - let delim_span = DelimSpan::from_pair(pre_span, self.token.span); - - match self.token.kind { - // Correct delimiter. - token::CloseDelim(d) if d == delim => { - let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); - let close_brace_span = self.token.span; - - if tts.is_empty() { - let empty_block_span = open_brace_span.to(close_brace_span); - if !sm.is_multiline(empty_block_span) { - // Only track if the block is in the form of `{}`, otherwise it is - // likely that it was written on purpose. - self.last_delim_empty_block_spans.insert(delim, empty_block_span); - } - } - - //only add braces - if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, delim) { - self.matching_block_spans.push((open_brace_span, close_brace_span)); - } - - if self.open_braces.is_empty() { - // Clear up these spans to avoid suggesting them as we've found - // properly matched delimiters so far for an entire block. - self.matching_delim_spans.clear(); - } else { - self.matching_delim_spans.push(( - open_brace, - open_brace_span, - close_brace_span, - )); - } - // Parse the closing delimiter. - self.bump(); - } - // Incorrect delimiter. 
- token::CloseDelim(other) => { - let mut unclosed_delimiter = None; - let mut candidate = None; - - if self.last_unclosed_found_span != Some(self.token.span) { - // do not complain about the same unclosed delimiter multiple times - self.last_unclosed_found_span = Some(self.token.span); - // This is a conservative error: only report the last unclosed - // delimiter. The previous unclosed delimiters could actually be - // closed! The parser just hasn't gotten to them yet. - if let Some(&(_, sp)) = self.open_braces.last() { - unclosed_delimiter = Some(sp); - }; - if let Some(current_padding) = sm.span_to_margin(self.token.span) { - for (brace, brace_span) in &self.open_braces { - if let Some(padding) = sm.span_to_margin(*brace_span) { - // high likelihood of these two corresponding - if current_padding == padding && brace == &other { - candidate = Some(*brace_span); - } - } - } - } - let (tok, _) = self.open_braces.pop().unwrap(); - self.unmatched_braces.push(UnmatchedBrace { - expected_delim: tok, - found_delim: Some(other), - found_span: self.token.span, - unclosed_span: unclosed_delimiter, - candidate_span: candidate, - }); - } else { - self.open_braces.pop(); - } - - // If the incorrect delimiter matches an earlier opening - // delimiter, then don't consume it (it can be used to - // close the earlier one). Otherwise, consume it. - // E.g., we try to recover from: - // fn foo() { - // bar(baz( - // } // Incorrect delimiter but matches the earlier `{` - if !self.open_braces.iter().any(|&(b, _)| b == other) { - self.bump(); + if let Some((delim, _)) = self.open_braces.last() { + if let Some((_, open_sp, close_sp)) = + self.matching_delim_spans.iter().find(|(d, open_sp, close_sp)| { + let sm = self.string_reader.sess.source_map(); + if let Some(close_padding) = sm.span_to_margin(*close_sp) { + if let Some(open_padding) = sm.span_to_margin(*open_sp) { + return delim == d && close_padding != open_padding; } } - token::Eof => { - // Silently recover, the EOF token will be seen again - // and an error emitted then. Thus we don't pop from - // self.open_braces here. - } - _ => {} - } - - Ok(TokenTree::Delimited(delim_span, delim, tts)) - } - token::CloseDelim(delim) => { - // An unexpected closing delimiter (i.e., there is no - // matching opening delimiter). 
- let token_str = token_to_string(&self.token); - let msg = format!("unexpected closing delimiter: `{}`", token_str); - let mut err = - self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg); - - // Braces are added at the end, so the last element is the biggest block - if let Some(parent) = self.matching_block_spans.last() { - if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) { - // Check if the (empty block) is in the last properly closed block - if (parent.0.to(parent.1)).contains(span) { - err.span_label( - span, - "block is empty, you might have not meant to close it", - ); - } else { - err.span_label(parent.0, "this opening brace..."); - - err.span_label(parent.1, "...matches this closing brace"); - } - } else { - err.span_label(parent.0, "this opening brace..."); - - err.span_label(parent.1, "...matches this closing brace"); - } - } - - err.span_label(self.token.span, "unexpected closing delimiter"); - Err(err) - } - _ => { - let tok = self.token.take(); - let mut spacing = self.bump(); - if !self.token.is_op() { - spacing = Spacing::Alone; - } - Ok(TokenTree::Token(tok, spacing)) + false + }) + // these are in reverse order as they get inserted on close, but + { + // we want the last open/first close + err.span_label(*open_sp, "this delimiter might not be properly closed..."); + err.span_label(*close_sp, "...as it matches this but it has different indentation"); } } + err + } + + fn parse_token_tree_open_delim(&mut self, delim: Delimiter) -> TokenTree { + // The span for beginning of the delimited section + let pre_span = self.token.span; + + // Parse the open delimiter. + self.open_braces.push((delim, self.token.span)); + self.bump(); + + // Parse the token trees within the delimiters. + // We stop at any delimiter so we can try to recover if the user + // uses an incorrect delimiter. + let tts = self.parse_token_trees_until_close_delim(); + + // Expand to cover the entire delimited token tree + let delim_span = DelimSpan::from_pair(pre_span, self.token.span); + + match self.token.kind { + // Correct delimiter. + token::CloseDelim(d) if d == delim => { + let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); + let close_brace_span = self.token.span; + + if tts.is_empty() { + let empty_block_span = open_brace_span.to(close_brace_span); + let sm = self.string_reader.sess.source_map(); + if !sm.is_multiline(empty_block_span) { + // Only track if the block is in the form of `{}`, otherwise it is + // likely that it was written on purpose. + self.last_delim_empty_block_spans.insert(delim, empty_block_span); + } + } + + //only add braces + if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, delim) { + self.matching_block_spans.push((open_brace_span, close_brace_span)); + } + + if self.open_braces.is_empty() { + // Clear up these spans to avoid suggesting them as we've found + // properly matched delimiters so far for an entire block. + self.matching_delim_spans.clear(); + } else { + self.matching_delim_spans.push((open_brace, open_brace_span, close_brace_span)); + } + // Parse the closing delimiter. + self.bump(); + } + // Incorrect delimiter. + token::CloseDelim(other) => { + let mut unclosed_delimiter = None; + let mut candidate = None; + + if self.last_unclosed_found_span != Some(self.token.span) { + // do not complain about the same unclosed delimiter multiple times + self.last_unclosed_found_span = Some(self.token.span); + // This is a conservative error: only report the last unclosed + // delimiter. 
The previous unclosed delimiters could actually be + // closed! The parser just hasn't gotten to them yet. + if let Some(&(_, sp)) = self.open_braces.last() { + unclosed_delimiter = Some(sp); + }; + let sm = self.string_reader.sess.source_map(); + if let Some(current_padding) = sm.span_to_margin(self.token.span) { + for (brace, brace_span) in &self.open_braces { + if let Some(padding) = sm.span_to_margin(*brace_span) { + // high likelihood of these two corresponding + if current_padding == padding && brace == &other { + candidate = Some(*brace_span); + } + } + } + } + let (tok, _) = self.open_braces.pop().unwrap(); + self.unmatched_braces.push(UnmatchedBrace { + expected_delim: tok, + found_delim: Some(other), + found_span: self.token.span, + unclosed_span: unclosed_delimiter, + candidate_span: candidate, + }); + } else { + self.open_braces.pop(); + } + + // If the incorrect delimiter matches an earlier opening + // delimiter, then don't consume it (it can be used to + // close the earlier one). Otherwise, consume it. + // E.g., we try to recover from: + // fn foo() { + // bar(baz( + // } // Incorrect delimiter but matches the earlier `{` + if !self.open_braces.iter().any(|&(b, _)| b == other) { + self.bump(); + } + } + token::Eof => { + // Silently recover, the EOF token will be seen again + // and an error emitted then. Thus we don't pop from + // self.open_braces here. + } + _ => {} + } + + TokenTree::Delimited(delim_span, delim, tts) + } + + fn close_delim_err(&mut self, delim: Delimiter) -> PErr<'a> { + // An unexpected closing delimiter (i.e., there is no + // matching opening delimiter). + let token_str = token_to_string(&self.token); + let msg = format!("unexpected closing delimiter: `{}`", token_str); + let mut err = + self.string_reader.sess.span_diagnostic.struct_span_err(self.token.span, &msg); + + // Braces are added at the end, so the last element is the biggest block + if let Some(parent) = self.matching_block_spans.last() { + if let Some(span) = self.last_delim_empty_block_spans.remove(&delim) { + // Check if the (empty block) is in the last properly closed block + if (parent.0.to(parent.1)).contains(span) { + err.span_label(span, "block is empty, you might have not meant to close it"); + } else { + err.span_label(parent.0, "this opening brace..."); + err.span_label(parent.1, "...matches this closing brace"); + } + } else { + err.span_label(parent.0, "this opening brace..."); + err.span_label(parent.1, "...matches this closing brace"); + } + } + + err.span_label(self.token.span, "unexpected closing delimiter"); + err + } + + #[inline] + fn parse_token_tree_other(&mut self) -> TokenTree { + let tok = self.token.take(); + let mut spacing = self.bump(); + if !self.token.is_op() { + spacing = Spacing::Alone; + } + TokenTree::Token(tok, spacing) } fn bump(&mut self) -> Spacing { From 14281e614759f6e761a01acaf9dccbd0da4ff1ef Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 21 Sep 2022 15:13:20 +1000 Subject: [PATCH 02/15] Remove unnecessary `spacing` assignment. It has no useful effect. 
---
 compiler/rustc_parse/src/lexer/mod.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 63819a2f98d..394a5b86480 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -70,7 +70,6 @@ impl<'a> StringReader<'a> {
             && let Some(shebang_len) = rustc_lexer::strip_shebang(self.src)
         {
             self.pos = self.pos + BytePos::from_usize(shebang_len);
-            spacing = Spacing::Alone;
         }
 
         // Skip trivial (whitespace & comments) tokens

From 9640d1c02354dc3167f775e56629aaf8974e78f7 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Wed, 21 Sep 2022 15:18:36 +1000
Subject: [PATCH 03/15] Move `#!` checking.

Currently the lexer does the "is this a `#!` at the start of the
file?" check for every single token(!) This commit moves it so it only
happens once.
---
 compiler/rustc_parse/src/lexer/mod.rs | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 394a5b86480..0e8a739fb62 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -38,10 +38,16 @@ pub struct UnmatchedBrace {
 
 pub(crate) fn parse_token_trees<'a>(
     sess: &'a ParseSess,
-    src: &'a str,
-    start_pos: BytePos,
+    mut src: &'a str,
+    mut start_pos: BytePos,
     override_span: Option<Span>,
 ) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
+    // Skip `#!`, if present.
+    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
+        src = &src[shebang_len..];
+        start_pos = start_pos + BytePos::from_usize(shebang_len);
+    }
+
     StringReader { sess, start_pos, pos: start_pos, src, override_span }.into_token_trees()
 }
 
 struct StringReader<'a> {
@@ -65,13 +71,6 @@ impl<'a> StringReader<'a> {
     fn next_token(&mut self) -> (Spacing, Token) {
         let mut spacing = Spacing::Joint;
 
-        // Skip `#!` at the start of the file
-        if self.pos == self.start_pos
-            && let Some(shebang_len) = rustc_lexer::strip_shebang(self.src)
-        {
-            self.pos = self.pos + BytePos::from_usize(shebang_len);
-        }
-
         // Skip trivial (whitespace & comments) tokens
         loop {
             let start_src_index = self.src_index(self.pos);

From d7928a92e588e11a6c7145cde9ccfd75c4c7cc01 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Wed, 21 Sep 2022 16:38:28 +1000
Subject: [PATCH 04/15] Clarify spacing computation.

The spacing computation is done in two parts. In the first part
`next_token` and `bump` use `Spacing::Alone` to mean "preceded by
whitespace" and `Spacing::Joint` to mean the opposite. In the second
part `parse_token_tree_other` then adjusts the `spacing` value to mean
the usual thing (i.e. "is the following token joinable punctuation?").

This shift in meaning is very confusing and it took me some time to
understand what was going on. This commit changes the first part to
use a bool, and adds some comments, which makes things much clearer.
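
To make the "two parts" concrete, here is a minimal standalone sketch
of the computation described above (`Tok` and `is_op` are simplified
stand-ins, not the real rustc types):

    #[derive(Clone, Copy, PartialEq, Debug)]
    enum Spacing {
        Alone, // next token is whitespace-separated or not joinable
        Joint, // next token is joinable punctuation with no gap
    }

    struct Tok {
        is_op: bool, // is this token punctuation/an operator?
    }

    // Part 1 supplies a plain bool: was the *next* token preceded by
    // whitespace? Part 2 combines that bool with the next token's kind
    // to compute the *current* token's spacing.
    fn this_spacing(next_preceded_by_whitespace: bool, next_tok: &Tok) -> Spacing {
        if next_preceded_by_whitespace || !next_tok.is_op {
            Spacing::Alone
        } else {
            Spacing::Joint
        }
    }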
--- compiler/rustc_parse/src/lexer/mod.rs | 15 ++++++++------- compiler/rustc_parse/src/lexer/tokentrees.rs | 18 ++++++++++++------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 0e8a739fb62..7d5f736a6f4 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,7 +1,7 @@ use crate::lexer::unicode_chars::UNICODE_ARRAY; use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; -use rustc_ast::tokenstream::{Spacing, TokenStream}; +use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; use rustc_lexer::unescape::{self, Mode}; @@ -67,9 +67,10 @@ impl<'a> StringReader<'a> { self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi)) } - /// Returns the next token, and info about preceding whitespace, if any. - fn next_token(&mut self) -> (Spacing, Token) { - let mut spacing = Spacing::Joint; + /// Returns the next token, paired with a bool indicating if the token was + /// preceded by whitespace. + fn next_token(&mut self) -> (Token, bool) { + let mut preceded_by_whitespace = false; // Skip trivial (whitespace & comments) tokens loop { @@ -78,7 +79,7 @@ impl<'a> StringReader<'a> { if text.is_empty() { let span = self.mk_sp(self.pos, self.pos); - return (spacing, Token::new(token::Eof, span)); + return (Token::new(token::Eof, span), preceded_by_whitespace); } let token = rustc_lexer::first_token(text); @@ -91,9 +92,9 @@ impl<'a> StringReader<'a> { match self.cook_lexer_token(token.kind, start) { Some(kind) => { let span = self.mk_sp(start, self.pos); - return (spacing, Token::new(kind, span)); + return (Token::new(kind, span), preceded_by_whitespace); } - None => spacing = Spacing::Alone, + None => preceded_by_whitespace = true, } } } diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 3372544a579..fe95742972f 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -246,16 +246,22 @@ impl<'a> TokenTreesReader<'a> { #[inline] fn parse_token_tree_other(&mut self) -> TokenTree { + // `spacing` for the returned token is determined by the next token: + // its kind and its `preceded_by_whitespace` status. let tok = self.token.take(); - let mut spacing = self.bump(); - if !self.token.is_op() { - spacing = Spacing::Alone; - } + let is_next_tok_preceded_by_whitespace = self.bump(); + let spacing = if is_next_tok_preceded_by_whitespace || !self.token.is_op() { + Spacing::Alone + } else { + Spacing::Joint + }; TokenTree::Token(tok, spacing) } - fn bump(&mut self) -> Spacing { - let (spacing, token) = self.string_reader.next_token(); + // Set `self.token` to the next token. Returns a bool indicating if that + // token was preceded by whitespace. + fn bump(&mut self) -> bool { + let (token, spacing) = self.string_reader.next_token(); self.token = token; spacing } From 5b2075e03d90b53dddda4459ad299c1ffa9cf960 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 21 Sep 2022 17:03:09 +1000 Subject: [PATCH 05/15] Remove `TokenTreesReader::bump`. It's an unnecessary layer that obfuscates when I am looking for optimizations. 
--- compiler/rustc_parse/src/lexer/tokentrees.rs | 26 +++++++------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index fe95742972f..ae82d09ba41 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -45,9 +45,8 @@ struct TokenTreesReader<'a> { impl<'a> TokenTreesReader<'a> { // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`. fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> { + self.token = self.string_reader.next_token().0; let mut buf = TokenStreamBuilder::default(); - - self.bump(); loop { match self.token.kind { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), @@ -116,7 +115,7 @@ impl<'a> TokenTreesReader<'a> { // Parse the open delimiter. self.open_braces.push((delim, self.token.span)); - self.bump(); + self.token = self.string_reader.next_token().0; // Parse the token trees within the delimiters. // We stop at any delimiter so we can try to recover if the user @@ -155,7 +154,7 @@ impl<'a> TokenTreesReader<'a> { self.matching_delim_spans.push((open_brace, open_brace_span, close_brace_span)); } // Parse the closing delimiter. - self.bump(); + self.token = self.string_reader.next_token().0; } // Incorrect delimiter. token::CloseDelim(other) => { @@ -202,7 +201,7 @@ impl<'a> TokenTreesReader<'a> { // bar(baz( // } // Incorrect delimiter but matches the earlier `{` if !self.open_braces.iter().any(|&(b, _)| b == other) { - self.bump(); + self.token = self.string_reader.next_token().0; } } token::Eof => { @@ -248,22 +247,15 @@ impl<'a> TokenTreesReader<'a> { fn parse_token_tree_other(&mut self) -> TokenTree { // `spacing` for the returned token is determined by the next token: // its kind and its `preceded_by_whitespace` status. - let tok = self.token.take(); - let is_next_tok_preceded_by_whitespace = self.bump(); - let spacing = if is_next_tok_preceded_by_whitespace || !self.token.is_op() { + let this_tok = self.token.take(); + let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); + let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { Spacing::Alone } else { Spacing::Joint }; - TokenTree::Token(tok, spacing) - } - - // Set `self.token` to the next token. Returns a bool indicating if that - // token was preceded by whitespace. - fn bump(&mut self) -> bool { - let (token, spacing) = self.string_reader.next_token(); - self.token = token; - spacing + self.token = next_tok; + TokenTree::Token(this_tok, this_spacing) } } From 33ba2776c903dab45e4c9a8c9313ce5d59e69af1 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 12:57:37 +1000 Subject: [PATCH 06/15] Remove `ast::Token::take`. Instead of replacing `TokenTreesReader::token` in two steps, we can just do it in one, which is both simpler and faster. 
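
The one-step version relies on `std::mem::replace`, which moves the
new value in and returns the old one, so no intermediate dummy token
is needed. A minimal sketch of the two shapes (a `String` field stands
in for the real `Token`):

    use std::mem;

    struct Reader {
        token: String, // stand-in for the real `Token` field
    }

    impl Reader {
        // Two steps: `take` leaves a dummy behind, then we overwrite it.
        fn advance_two_step(&mut self, next: String) -> String {
            let old = mem::replace(&mut self.token, String::new()); // the old `take`
            self.token = next;
            old
        }

        // One step: swap `next` in and get the old token back directly.
        fn advance_one_step(&mut self, next: String) -> String {
            mem::replace(&mut self.token, next)
        }
    }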
---
 compiler/rustc_ast/src/token.rs              | 7 +------
 compiler/rustc_parse/src/lexer/tokentrees.rs | 3 +--
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/compiler/rustc_ast/src/token.rs b/compiler/rustc_ast/src/token.rs
index 97dfb783767..fa6162c5184 100644
--- a/compiler/rustc_ast/src/token.rs
+++ b/compiler/rustc_ast/src/token.rs
@@ -13,7 +13,7 @@ use rustc_span::symbol::{kw, sym};
 use rustc_span::symbol::{Ident, Symbol};
 use rustc_span::{self, edition::Edition, Span, DUMMY_SP};
 use std::borrow::Cow;
-use std::{fmt, mem};
+use std::fmt;
 
 #[derive(Clone, Copy, PartialEq, Encodable, Decodable, Debug, HashStable_Generic)]
 pub enum CommentKind {
@@ -335,11 +335,6 @@ impl Token {
         Token::new(Ident(ident.name, ident.is_raw_guess()), ident.span)
     }
 
-    /// Return this token by value and leave a dummy token in its place.
-    pub fn take(&mut self) -> Self {
-        mem::replace(self, Token::dummy())
-    }
-
     /// For interpolated tokens, returns a span of the fragment to which the interpolated
     /// token refers. For all other tokens this is just a regular span.
     /// It is particularly important to use this for identifiers and lifetimes
diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs
index ae82d09ba41..c23090e7142 100644
--- a/compiler/rustc_parse/src/lexer/tokentrees.rs
+++ b/compiler/rustc_parse/src/lexer/tokentrees.rs
@@ -247,14 +247,13 @@ impl<'a> TokenTreesReader<'a> {
     fn parse_token_tree_other(&mut self) -> TokenTree {
         // `spacing` for the returned token is determined by the next token:
         // its kind and its `preceded_by_whitespace` status.
-        let this_tok = self.token.take();
         let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
         let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() {
             Spacing::Alone
         } else {
             Spacing::Joint
         };
-        self.token = next_tok;
+        let this_tok = std::mem::replace(&mut self.token, next_tok);
         TokenTree::Token(this_tok, this_spacing)
     }
 }

From 33516ac09af7038efce6332afdedc758a3943609 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Mon, 26 Sep 2022 08:50:52 +1000
Subject: [PATCH 07/15] Rearrange `StringReader`/`TokenTreesReader` creation.

`TokenTreesReader` wraps a `StringReader`, but the `into_token_trees`
function obscures this. This commit moves to a more straightforward
control flow.
---
 compiler/rustc_parse/src/lexer/mod.rs        |  3 +-
 compiler/rustc_parse/src/lexer/tokentrees.rs | 37 ++++++++++----------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 7d5f736a6f4..bdc8e96b889 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -48,7 +48,8 @@ pub(crate) fn parse_token_trees<'a>(
         start_pos = start_pos + BytePos::from_usize(shebang_len);
     }
 
-    StringReader { sess, start_pos, pos: start_pos, src, override_span }.into_token_trees()
+    let string_reader = StringReader { sess, start_pos, pos: start_pos, src, override_span };
+    tokentrees::TokenTreesReader::parse_token_trees(string_reader)
 }
 
 struct StringReader<'a> {
diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs
index c23090e7142..749a640f92e 100644
--- a/compiler/rustc_parse/src/lexer/tokentrees.rs
+++ b/compiler/rustc_parse/src/lexer/tokentrees.rs
@@ -1,5 +1,4 @@
 use super::{StringReader, UnmatchedBrace};
-
 use rustc_ast::token::{self, Delimiter, Token};
 use rustc_ast::tokenstream::{DelimSpan, Spacing, TokenStream, TokenTree};
 use rustc_ast_pretty::pprust::token_to_string;
@@ -7,24 +6,7 @@ use rustc_data_structures::fx::FxHashMap;
 use rustc_errors::{PErr, PResult};
 use rustc_span::Span;
 
-impl<'a> StringReader<'a> {
-    pub(super) fn into_token_trees(self) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
-        let mut tt_reader = TokenTreesReader {
-            string_reader: self,
-            token: Token::dummy(),
-            open_braces: Vec::new(),
-            unmatched_braces: Vec::new(),
-            matching_delim_spans: Vec::new(),
-            last_unclosed_found_span: None,
-            last_delim_empty_block_spans: FxHashMap::default(),
-            matching_block_spans: Vec::new(),
-        };
-        let res = tt_reader.parse_all_token_trees();
-        (res, tt_reader.unmatched_braces)
-    }
-}
-
-struct TokenTreesReader<'a> {
+pub(super) struct TokenTreesReader<'a> {
     string_reader: StringReader<'a>,
     token: Token,
     /// Stack of open delimiters and their spans. Used for error message.
@@ -43,6 +25,23 @@ struct TokenTreesReader<'a> {
 }
 
 impl<'a> TokenTreesReader<'a> {
+    pub(super) fn parse_token_trees(
+        string_reader: StringReader<'a>,
+    ) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
+        let mut tt_reader = TokenTreesReader {
+            string_reader,
+            token: Token::dummy(),
+            open_braces: Vec::new(),
+            unmatched_braces: Vec::new(),
+            matching_delim_spans: Vec::new(),
+            last_unclosed_found_span: None,
+            last_delim_empty_block_spans: FxHashMap::default(),
+            matching_block_spans: Vec::new(),
+        };
+        let res = tt_reader.parse_all_token_trees();
+        (res, tt_reader.unmatched_braces)
+    }
+
     // Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`.
     fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
         self.token = self.string_reader.next_token().0;

From aa6bfaf04b258e3e23d3f7063de4f2d37845ddec Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Mon, 26 Sep 2022 09:18:23 +1000
Subject: [PATCH 08/15] Make `rustc_lexer::cursor::Cursor` public.

`Cursor` is currently hidden, and the main tokenization path uses
`rustc_lexer::first_token`, which involves constructing a new `Cursor`
for every single token, which is weird. Also, `first_token` can't
handle empty input, so callers have to check for that first.

This commit makes `Cursor` public, so `StringReader` can contain a
`Cursor`, which results in a simpler structure. The commit also
changes `StringReader::advance_token` so it returns an
`Option<Token>`, simplifying the empty input case.
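
As a usage sketch, a caller can now hold one `Cursor` for the whole
input and pull tokens until it is exhausted, instead of re-slicing the
input and building a fresh cursor per token (hypothetical consumer
code, not part of this patch):

    use rustc_lexer::cursor::Cursor;

    // `advance_token` returning `Option<Token>` folds the old
    // "is the input empty?" pre-check into ordinary iteration.
    fn count_tokens(src: &str) -> usize {
        let mut cursor = Cursor::new(src);
        let mut n = 0;
        while let Some(_token) = cursor.advance_token() {
            n += 1;
        }
        n
    }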
---
 compiler/rustc_lexer/src/cursor.rs    |  4 ++--
 compiler/rustc_lexer/src/lib.rs       | 26 +++++++-------------------
 compiler/rustc_parse/src/lexer/mod.rs | 23 +++++++++++++----------
 src/librustdoc/html/highlight.rs      |  9 ++++-----
 4 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs
index 21557a9c854..df9b6afdf56 100644
--- a/compiler/rustc_lexer/src/cursor.rs
+++ b/compiler/rustc_lexer/src/cursor.rs
@@ -4,7 +4,7 @@ use std::str::Chars;
 ///
 /// Next characters can be peeked via `first` method,
 /// and position can be shifted forward via `bump` method.
-pub(crate) struct Cursor<'a> {
+pub struct Cursor<'a> {
     initial_len: usize,
     /// Iterator over chars. Slightly faster than a &str.
     chars: Chars<'a>,
@@ -15,7 +15,7 @@ pub(crate) const EOF_CHAR: char = '\0';
 
 impl<'a> Cursor<'a> {
-    pub(crate) fn new(input: &'a str) -> Cursor<'a> {
+    pub fn new(input: &'a str) -> Cursor<'a> {
         Cursor {
             initial_len: input.len(),
             chars: input.chars(),
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index a79c982649a..9182b649bf3 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -23,7 +23,7 @@
 // We want to be able to build this crate with a stable compiler, so no
 // `#![feature]` attributes should be added.
 
-mod cursor;
+pub mod cursor;
 pub mod unescape;
 
 #[cfg(test)]
@@ -219,13 +219,6 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
     None
 }
 
-/// Parses the first token from the provided input string.
-#[inline]
-pub fn first_token(input: &str) -> Token {
-    debug_assert!(!input.is_empty());
-    Cursor::new(input).advance_token()
-}
-
 /// Validates a raw string literal. Used for getting more information about a
 /// problem with a `RawStr`/`RawByteStr` with a `None` field.
 #[inline]
@@ -242,14 +235,7 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
-    std::iter::from_fn(move || {
-        if cursor.is_eof() {
-            None
-        } else {
-            cursor.reset_len_consumed();
-            Some(cursor.advance_token())
-        }
-    })
+    std::iter::from_fn(move || cursor.advance_token())
 }
 
 /// True if `c` is considered a whitespace according to Rust language definition.
@@ -311,8 +297,8 @@ pub fn is_ident(string: &str) -> bool {
 
 impl Cursor<'_> {
     /// Parses a token from the input string.
-    fn advance_token(&mut self) -> Token {
-        let first_char = self.bump().unwrap();
+    pub fn advance_token(&mut self) -> Option<Token> {
+        let first_char = self.bump()?;
         let token_kind = match first_char {
             // Slash, comment or block comment.
'/' => match self.first() { @@ -433,7 +419,9 @@ impl Cursor<'_> { } _ => Unknown, }; - Token::new(token_kind, self.len_consumed()) + let res = Some(Token::new(token_kind, self.len_consumed())); + self.reset_len_consumed(); + res } fn line_comment(&mut self) -> TokenKind { diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index bdc8e96b889..c182e86332a 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -4,6 +4,7 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; +use rustc_lexer::cursor::Cursor; use rustc_lexer::unescape::{self, Mode}; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ @@ -48,7 +49,9 @@ pub(crate) fn parse_token_trees<'a>( start_pos = start_pos + BytePos::from_usize(shebang_len); } - let string_reader = StringReader { sess, start_pos, pos: start_pos, src, override_span }; + let cursor = Cursor::new(src); + let string_reader = + StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span }; tokentrees::TokenTreesReader::parse_token_trees(string_reader) } @@ -60,6 +63,8 @@ struct StringReader<'a> { pos: BytePos, /// Source text to tokenize. src: &'a str, + /// Cursor for getting lexer tokens. + cursor: Cursor<'a>, override_span: Option, } @@ -75,15 +80,13 @@ impl<'a> StringReader<'a> { // Skip trivial (whitespace & comments) tokens loop { - let start_src_index = self.src_index(self.pos); - let text: &str = &self.src[start_src_index..]; - - if text.is_empty() { - let span = self.mk_sp(self.pos, self.pos); - return (Token::new(token::Eof, span), preceded_by_whitespace); - } - - let token = rustc_lexer::first_token(text); + let token = match self.cursor.advance_token() { + Some(token) => token, + None => { + let span = self.mk_sp(self.pos, self.pos); + return (Token::new(token::Eof, span), preceded_by_whitespace); + } + }; let start = self.pos; self.pos = self.pos + BytePos(token.len); diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 8922bf37785..0870d6f3824 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,6 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; +use rustc_lexer::cursor::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol; @@ -408,15 +409,13 @@ enum Highlight<'a> { struct TokenIter<'a> { src: &'a str, + cursor: Cursor<'a>, } impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - if self.src.is_empty() { - return None; - } - let token = rustc_lexer::first_token(self.src); + let token = self.cursor.advance_token()?; let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -525,7 +524,7 @@ impl<'a> Classifier<'a> { /// Takes as argument the source code to HTML-ify, the rust edition to use and the source code /// file span which will be used later on by the `span_correspondance_map`. 
     fn new(src: &str, file_span: Span, decoration_info: Option<DecorationInfo>) -> Classifier<'_> {
-        let tokens = PeekIter::new(TokenIter { src });
+        let tokens = PeekIter::new(TokenIter { src, cursor: Cursor::new(src) });
         let decorations = decoration_info.map(Decorations::new);
         Classifier {
             tokens,

From ceb25d125f98b82fe264e5b9d1b992f0766939a8 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Mon, 26 Sep 2022 11:12:06 +1000
Subject: [PATCH 09/15] Use less DRY in `cook_lexer_token`.

This is a case where a small amount of repetition results in code that
is faster and easier to read.
---
 compiler/rustc_parse/src/lexer/mod.rs | 38 +++++++++++++--------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index c182e86332a..0f9d585230e 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -201,28 +201,28 @@ impl<'a> StringReader<'a> {
                 self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
             }
             rustc_lexer::TokenKind::Whitespace => return None,
-            rustc_lexer::TokenKind::Ident
-            | rustc_lexer::TokenKind::RawIdent
-            | rustc_lexer::TokenKind::UnknownPrefix => {
-                let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
-                let is_unknown_prefix = token == rustc_lexer::TokenKind::UnknownPrefix;
-                let mut ident_start = start;
-                if is_raw_ident {
-                    ident_start = ident_start + BytePos(2);
-                }
-                if is_unknown_prefix {
-                    self.report_unknown_prefix(start);
-                }
-                let sym = nfc_normalize(self.str_from(ident_start));
+            rustc_lexer::TokenKind::Ident => {
+                let sym = nfc_normalize(self.str_from(start));
                 let span = self.mk_sp(start, self.pos);
                 self.sess.symbol_gallery.insert(sym, span);
-                if is_raw_ident {
-                    if !sym.can_be_raw() {
-                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
-                    }
-                    self.sess.raw_identifier_spans.borrow_mut().push(span);
+                token::Ident(sym, false)
+            }
+            rustc_lexer::TokenKind::RawIdent => {
+                let sym = nfc_normalize(self.str_from(start + BytePos(2)));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.symbol_gallery.insert(sym, span);
+                if !sym.can_be_raw() {
+                    self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
                 }
-                token::Ident(sym, is_raw_ident)
+                self.sess.raw_identifier_spans.borrow_mut().push(span);
+                token::Ident(sym, true)
+            }
+            rustc_lexer::TokenKind::UnknownPrefix => {
+                self.report_unknown_prefix(start);
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.symbol_gallery.insert(sym, span);
+                token::Ident(sym, false)
             }
             rustc_lexer::TokenKind::InvalidIdent
             // Do not recover an identifier with emoji if the codepoint is a confusable

From cc0022a3634dd4a931ac9f68e63017c959bc8be7 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Mon, 26 Sep 2022 13:05:54 +1000
Subject: [PATCH 10/15] Rename some things.

`Cursor` keeps track of the position within the current token. But it
uses confusing names that don't make it clear that the "length
consumed" is just within the current token. This commit renames things
to make this clearer.
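
The arithmetic behind the new names, in isolation: `len_remaining` is
snapshotted when each token starts, so subtracting the untouched tail
gives an offset within the current token only. A standalone sketch
mirroring the patch (simplified, not the exact crate internals):

    use std::str::Chars;

    struct Cursor<'a> {
        len_remaining: usize, // bytes left when the current token started
        chars: Chars<'a>,     // unconsumed input
    }

    impl<'a> Cursor<'a> {
        fn new(input: &'a str) -> Self {
            Cursor { len_remaining: input.len(), chars: input.chars() }
        }

        // Offset within the *current* token, not the whole input --
        // exactly the distinction the old name `len_consumed` obscured.
        fn pos_within_token(&self) -> u32 {
            (self.len_remaining - self.chars.as_str().len()) as u32
        }

        // Called once a token is emitted, so the offset restarts at 0.
        fn reset_pos_within_token(&mut self) {
            self.len_remaining = self.chars.as_str().len();
        }
    }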
--- compiler/rustc_lexer/src/cursor.rs | 12 ++++++------ compiler/rustc_lexer/src/lib.rs | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index df9b6afdf56..eceef59802e 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -5,7 +5,7 @@ use std::str::Chars; /// Next characters can be peeked via `first` method, /// and position can be shifted forward via `bump` method. pub struct Cursor<'a> { - initial_len: usize, + len_remaining: usize, /// Iterator over chars. Slightly faster than a &str. chars: Chars<'a>, #[cfg(debug_assertions)] @@ -17,7 +17,7 @@ pub(crate) const EOF_CHAR: char = '\0'; impl<'a> Cursor<'a> { pub fn new(input: &'a str) -> Cursor<'a> { Cursor { - initial_len: input.len(), + len_remaining: input.len(), chars: input.chars(), #[cfg(debug_assertions)] prev: EOF_CHAR, @@ -61,13 +61,13 @@ impl<'a> Cursor<'a> { } /// Returns amount of already consumed symbols. - pub(crate) fn len_consumed(&self) -> u32 { - (self.initial_len - self.chars.as_str().len()) as u32 + pub(crate) fn pos_within_token(&self) -> u32 { + (self.len_remaining - self.chars.as_str().len()) as u32 } /// Resets the number of bytes consumed to 0. - pub(crate) fn reset_len_consumed(&mut self) { - self.initial_len = self.chars.as_str().len(); + pub(crate) fn reset_pos_within_token(&mut self) { + self.len_remaining = self.chars.as_str().len(); } /// Moves to the next character. diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 9182b649bf3..69e772c6924 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -315,7 +315,7 @@ impl Cursor<'_> { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { let res = self.raw_double_quoted_string(1); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -330,7 +330,7 @@ impl Cursor<'_> { ('\'', _) => { self.bump(); let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -340,7 +340,7 @@ impl Cursor<'_> { ('"', _) => { self.bump(); let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if terminated { self.eat_literal_suffix(); } @@ -350,7 +350,7 @@ impl Cursor<'_> { ('r', '"') | ('r', '#') => { self.bump(); let res = self.raw_double_quoted_string(2); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); if res.is_ok() { self.eat_literal_suffix(); } @@ -367,7 +367,7 @@ impl Cursor<'_> { // Numeric literal. c @ '0'..='9' => { let literal_kind = self.number(c); - let suffix_start = self.len_consumed(); + let suffix_start = self.pos_within_token(); self.eat_literal_suffix(); TokenKind::Literal { kind: literal_kind, suffix_start } } @@ -406,7 +406,7 @@ impl Cursor<'_> { // String literal. 
            '"' => {
                 let terminated = self.double_quoted_string();
-                let suffix_start = self.len_consumed();
+                let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
@@ -419,8 +419,8 @@ impl Cursor<'_> {
             }
             _ => Unknown,
         };
-        let res = Some(Token::new(token_kind, self.len_consumed()));
-        self.reset_len_consumed();
+        let res = Some(Token::new(token_kind, self.pos_within_token()));
+        self.reset_pos_within_token();
         res
     }
@@ -606,7 +606,7 @@ impl Cursor<'_> {
 
         if !can_be_a_lifetime {
             let terminated = self.single_quoted_string();
-            let suffix_start = self.len_consumed();
+            let suffix_start = self.pos_within_token();
             if terminated {
                 self.eat_literal_suffix();
             }
@@ -631,7 +631,7 @@ impl Cursor<'_> {
         if self.first() == '\'' {
             self.bump();
             let kind = Char { terminated: true };
-            Literal { kind, suffix_start: self.len_consumed() }
+            Literal { kind, suffix_start: self.pos_within_token() }
         } else {
             Lifetime { starts_with_number }
         }
@@ -712,7 +712,7 @@ impl Cursor<'_> {
     fn raw_string_unvalidated(&mut self, prefix_len: u32) -> Result<u32, RawStrError> {
         debug_assert!(self.prev() == 'r');
-        let start_pos = self.len_consumed();
+        let start_pos = self.pos_within_token();
         let mut possible_terminator_offset = None;
         let mut max_hashes = 0;
@@ -766,7 +766,7 @@ impl Cursor<'_> {
                     // Keep track of possible terminators to give a hint about
                     // where there might be a missing terminator
                     possible_terminator_offset =
-                        Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                        Some(self.pos_within_token() - start_pos - n_end_hashes + prefix_len);
                     max_hashes = n_end_hashes;
                 }
             }

From da84f0f4c31914c14dd03628395e9c53f28b88b9 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Mon, 26 Sep 2022 13:06:15 +1000
Subject: [PATCH 11/15] Add `rustc_lexer::TokenKind::Eof`.

For alignment with `rustc_ast::TokenKind::Eof`. Plus it's a bit
faster, due to less `Option` manipulation in `StringReader::next_token`.
---
 compiler/rustc_lexer/src/lib.rs       | 17 +++++++++++++----
 compiler/rustc_parse/src/lexer/mod.rs | 10 ++--------
 src/librustdoc/html/highlight.rs      |  6 +++++-
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
index 69e772c6924..18ebed7c70e 100644
--- a/compiler/rustc_lexer/src/lib.rs
+++ b/compiler/rustc_lexer/src/lib.rs
@@ -139,6 +139,9 @@ pub enum TokenKind {
 
     /// Unknown token, not expected by the lexer, e.g. "№"
     Unknown,
+
+    /// End of input.
+    Eof,
 }
 
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -235,7 +238,10 @@ pub fn validate_raw_str(input: &str, prefix_len: u32) -> Result<(), RawStrError>
 /// Creates an iterator that produces tokens from the input string.
 pub fn tokenize(input: &str) -> impl Iterator<Item = Token> + '_ {
     let mut cursor = Cursor::new(input);
-    std::iter::from_fn(move || cursor.advance_token())
+    std::iter::from_fn(move || {
+        let token = cursor.advance_token();
+        if token.kind != TokenKind::Eof { Some(token) } else { None }
+    })
 }
 
 /// True if `c` is considered a whitespace according to Rust language definition.
@@ -297,8 +303,11 @@ pub fn is_ident(string: &str) -> bool {
 
 impl Cursor<'_> {
     /// Parses a token from the input string.
-    pub fn advance_token(&mut self) -> Option<Token> {
-        let first_char = self.bump()?;
+    pub fn advance_token(&mut self) -> Token {
+        let first_char = match self.bump() {
+            Some(c) => c,
+            None => return Token::new(TokenKind::Eof, 0),
+        };
         let token_kind = match first_char {
             // Slash, comment or block comment.
'/' => match self.first() { @@ -419,7 +428,7 @@ impl Cursor<'_> { } _ => Unknown, }; - let res = Some(Token::new(token_kind, self.pos_within_token())); + let res = Token::new(token_kind, self.pos_within_token()); self.reset_pos_within_token(); res } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 0f9d585230e..67fefd19d8b 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -80,14 +80,7 @@ impl<'a> StringReader<'a> { // Skip trivial (whitespace & comments) tokens loop { - let token = match self.cursor.advance_token() { - Some(token) => token, - None => { - let span = self.mk_sp(self.pos, self.pos); - return (Token::new(token::Eof, span), preceded_by_whitespace); - } - }; - + let token = self.cursor.advance_token(); let start = self.pos; self.pos = self.pos + BytePos(token.len); @@ -327,6 +320,7 @@ impl<'a> StringReader<'a> { err.emit(); token? } + rustc_lexer::TokenKind::Eof => token::Eof, }) } diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index 0870d6f3824..ea65a6334c9 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -415,7 +415,10 @@ struct TokenIter<'a> { impl<'a> Iterator for TokenIter<'a> { type Item = (TokenKind, &'a str); fn next(&mut self) -> Option<(TokenKind, &'a str)> { - let token = self.cursor.advance_token()?; + let token = self.cursor.advance_token(); + if token.kind == TokenKind::Eof { + return None; + } let (text, rest) = self.src.split_at(token.len as usize); self.src = rest; Some((token.kind, text)) @@ -849,6 +852,7 @@ impl<'a> Classifier<'a> { Class::Ident(self.new_span(before, text)) } TokenKind::Lifetime { .. } => Class::Lifetime, + TokenKind::Eof => panic!("Eof in advance"), }; // Anything that didn't return above is the simple case where we the // class just spans a single token, so we can use the `string` method. From fb4dba0a17b6eb241b4fd3732b976ab38fe2cdc0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 26 Sep 2022 12:12:58 +1000 Subject: [PATCH 12/15] Inline and remove `cook_lexer_token`. This is a small performance win, alas. --- compiler/rustc_parse/src/lexer/mod.rs | 347 +++++++++++++------------- 1 file changed, 175 insertions(+), 172 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 67fefd19d8b..151e80e2b3e 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -86,13 +86,182 @@ impl<'a> StringReader<'a> { debug!("next_token: {:?}({:?})", token.kind, self.str_from(start)); - match self.cook_lexer_token(token.kind, start) { - Some(kind) => { - let span = self.mk_sp(start, self.pos); - return (Token::new(kind, span), preceded_by_whitespace); + // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a + // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs + // additional validation. + let kind = match token.kind { + rustc_lexer::TokenKind::LineComment { doc_style } => { + // Skip non-doc comments + let Some(doc_style) = doc_style else { + self.lint_unicode_text_flow(start); + preceded_by_whitespace = true; + continue; + }; + + // Opening delimiter of the length 3 is not included into the symbol. 
+                    let content_start = start + BytePos(3);
+                    let content = self.str_from(content_start);
+                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
+                }
+                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
+                    if !terminated {
+                        self.report_unterminated_block_comment(start, doc_style);
+                    }
+
+                    // Skip non-doc comments
+                    let Some(doc_style) = doc_style else {
+                        self.lint_unicode_text_flow(start);
+                        preceded_by_whitespace = true;
+                        continue;
+                    };
+
+                    // Opening delimiter of the length 3 and closing delimiter of the length 2
+                    // are not included into the symbol.
+                    let content_start = start + BytePos(3);
+                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
+                    let content = self.str_from_to(content_start, content_end);
+                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
+                }
+                rustc_lexer::TokenKind::Whitespace => {
+                    preceded_by_whitespace = true;
+                    continue;
+                }
+                rustc_lexer::TokenKind::Ident => {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::RawIdent => {
+                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    if !sym.can_be_raw() {
+                        self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
+                    }
+                    self.sess.raw_identifier_spans.borrow_mut().push(span);
+                    token::Ident(sym, true)
+                }
+                rustc_lexer::TokenKind::UnknownPrefix => {
+                    self.report_unknown_prefix(start);
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.symbol_gallery.insert(sym, span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::InvalidIdent
+                    // Do not recover an identifier with emoji if the codepoint is a confusable
+                    // with a recoverable substitution token, like `➖`.
+                    if !UNICODE_ARRAY
+                        .iter()
+                        .any(|&(c, _, _)| {
+                            let sym = self.str_from(start);
+                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                        }) =>
+                {
+                    let sym = nfc_normalize(self.str_from(start));
+                    let span = self.mk_sp(start, self.pos);
+                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
+                        .push(span);
+                    token::Ident(sym, false)
+                }
+                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
+                    let suffix_start = start + BytePos(suffix_start);
+                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
+                    let suffix = if suffix_start < self.pos {
+                        let string = self.str_from(suffix_start);
+                        if string == "_" {
+                            self.sess
+                                .span_diagnostic
+                                .struct_span_warn(
+                                    self.mk_sp(suffix_start, self.pos),
+                                    "underscore literal suffix is not allowed",
+                                )
+                                .warn(
+                                    "this was previously accepted by the compiler but is \
+                                     being phased out; it will become a hard error in \
+                                     a future release!",
+                                )
+                                .note(
+                                    "see issue #42326 \
+                                     <https://github.com/rust-lang/rust/issues/42326> \
+                                     for more information",
+                                )
+                                .emit();
+                            None
+                        } else {
+                            Some(Symbol::intern(string))
+                        }
+                    } else {
+                        None
+                    };
+                    token::Literal(token::Lit { kind, symbol, suffix })
+                }
+                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
+                    // Include the leading `'` in the real identifier, for macro
+                    // expansion purposes. See #12512 for the gory details of why
+                    // this is necessary.
+                    let lifetime_name = self.str_from(start);
+                    if starts_with_number {
+                        self.err_span_(start, self.pos, "lifetimes cannot start with a number");
+                    }
+                    let ident = Symbol::intern(lifetime_name);
+                    token::Lifetime(ident)
+                }
+                rustc_lexer::TokenKind::Semi => token::Semi,
+                rustc_lexer::TokenKind::Comma => token::Comma,
+                rustc_lexer::TokenKind::Dot => token::Dot,
+                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
+                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
+                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
+                rustc_lexer::TokenKind::At => token::At,
+                rustc_lexer::TokenKind::Pound => token::Pound,
+                rustc_lexer::TokenKind::Tilde => token::Tilde,
+                rustc_lexer::TokenKind::Question => token::Question,
+                rustc_lexer::TokenKind::Colon => token::Colon,
+                rustc_lexer::TokenKind::Dollar => token::Dollar,
+                rustc_lexer::TokenKind::Eq => token::Eq,
+                rustc_lexer::TokenKind::Bang => token::Not,
+                rustc_lexer::TokenKind::Lt => token::Lt,
+                rustc_lexer::TokenKind::Gt => token::Gt,
+                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
+                rustc_lexer::TokenKind::And => token::BinOp(token::And),
+                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
+                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
+                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
+                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
+                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
+                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
+
+                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
+                    let c = self.str_from(start).chars().next().unwrap();
+                    let mut err =
+                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                    // FIXME: the lexer could be used to turn the ASCII version of unicode
+                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
+                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
+                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
+                    // fancier error recovery to it, as there will be less overall work to do this
+                    // way.
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    if c == '\x00' {
+                        err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
+                    }
+                    err.emit();
+                    if let Some(token) = token {
+                        token
+                    } else {
+                        preceded_by_whitespace = true;
+                        continue;
+                    }
+                }
+                rustc_lexer::TokenKind::Eof => token::Eof,
+            };
+            let span = self.mk_sp(start, self.pos);
+            return (Token::new(kind, span), preceded_by_whitespace);
         }
     }
 
@@ -158,172 +327,6 @@ impl<'a> StringReader<'a> {
         }
     }
 
-    /// Turns simple `rustc_lexer::TokenKind` enum into a rich
-    /// `rustc_ast::TokenKind`. This turns strings into interned
-    /// symbols and runs additional validation.
-    fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
-        Some(match token {
-            rustc_lexer::TokenKind::LineComment { doc_style } => {
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 is not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content = self.str_from(content_start);
-                self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
-            }
-            rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
-                if !terminated {
-                    self.report_unterminated_block_comment(start, doc_style);
-                }
-
-                // Skip non-doc comments
-                let Some(doc_style) = doc_style else {
-                    self.lint_unicode_text_flow(start);
-                    return None;
-                };
-
-                // Opening delimiter of the length 3 and closing delimiter of the length 2
-                // are not included into the symbol.
-                let content_start = start + BytePos(3);
-                let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
-                let content = self.str_from_to(content_start, content_end);
-                self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
-            }
-            rustc_lexer::TokenKind::Whitespace => return None,
-            rustc_lexer::TokenKind::Ident => {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::RawIdent => {
-                let sym = nfc_normalize(self.str_from(start + BytePos(2)));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                if !sym.can_be_raw() {
-                    self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
-                }
-                self.sess.raw_identifier_spans.borrow_mut().push(span);
-                token::Ident(sym, true)
-            }
-            rustc_lexer::TokenKind::UnknownPrefix => {
-                self.report_unknown_prefix(start);
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.symbol_gallery.insert(sym, span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::InvalidIdent
-                // Do not recover an identifier with emoji if the codepoint is a confusable
-                // with a recoverable substitution token, like `➖`.
-                if !UNICODE_ARRAY
-                    .iter()
-                    .any(|&(c, _, _)| {
-                        let sym = self.str_from(start);
-                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
-                    })
-                =>
-            {
-                let sym = nfc_normalize(self.str_from(start));
-                let span = self.mk_sp(start, self.pos);
-                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
-                token::Ident(sym, false)
-            }
-            rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
-                let suffix_start = start + BytePos(suffix_start);
-                let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
-                let suffix = if suffix_start < self.pos {
-                    let string = self.str_from(suffix_start);
-                    if string == "_" {
-                        self.sess
-                            .span_diagnostic
-                            .struct_span_warn(
-                                self.mk_sp(suffix_start, self.pos),
-                                "underscore literal suffix is not allowed",
-                            )
-                            .warn(
-                                "this was previously accepted by the compiler but is \
-                                 being phased out; it will become a hard error in \
-                                 a future release!",
-                            )
-                            .note(
-                                "see issue #42326 \
-                                 <https://github.com/rust-lang/rust/issues/42326> \
-                                 for more information",
-                            )
-                            .emit();
-                        None
-                    } else {
-                        Some(Symbol::intern(string))
-                    }
-                } else {
-                    None
-                };
-                token::Literal(token::Lit { kind, symbol, suffix })
-            }
-            rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
-                // Include the leading `'` in the real identifier, for macro
-                // expansion purposes. See #12512 for the gory details of why
-                // this is necessary.
-                let lifetime_name = self.str_from(start);
-                if starts_with_number {
-                    self.err_span_(start, self.pos, "lifetimes cannot start with a number");
-                }
-                let ident = Symbol::intern(lifetime_name);
-                token::Lifetime(ident)
-            }
-            rustc_lexer::TokenKind::Semi => token::Semi,
-            rustc_lexer::TokenKind::Comma => token::Comma,
-            rustc_lexer::TokenKind::Dot => token::Dot,
-            rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
-            rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
-            rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
-            rustc_lexer::TokenKind::At => token::At,
-            rustc_lexer::TokenKind::Pound => token::Pound,
-            rustc_lexer::TokenKind::Tilde => token::Tilde,
-            rustc_lexer::TokenKind::Question => token::Question,
-            rustc_lexer::TokenKind::Colon => token::Colon,
-            rustc_lexer::TokenKind::Dollar => token::Dollar,
-            rustc_lexer::TokenKind::Eq => token::Eq,
-            rustc_lexer::TokenKind::Bang => token::Not,
-            rustc_lexer::TokenKind::Lt => token::Lt,
-            rustc_lexer::TokenKind::Gt => token::Gt,
-            rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
-            rustc_lexer::TokenKind::And => token::BinOp(token::And),
-            rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
-            rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
-            rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
-            rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
-            rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
-            rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
-
-            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                let c = self.str_from(start).chars().next().unwrap();
-                let mut err =
-                    self.struct_err_span_char(start, self.pos, "unknown start of token", c);
-                // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
-                // instead of keeping a table in `check_for_substitution` into the token. Ideally,
-                // this should be inside `rustc_lexer`. However, we should first remove compound
-                // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
-                // as there will be less overall work to do this way.
-                let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
-                if c == '\x00' {
-                    err.help("source files must contain UTF-8 encoded text; unexpected null bytes might occur when a different encoding is used");
-                }
-                err.emit();
-                token?
-            }
-            rustc_lexer::TokenKind::Eof => token::Eof,
-        })
-    }
-
     fn cook_doc_comment(
         &self,
         content_start: BytePos,

From 880ebb657a066ef039139592750f4f4ca45f8277 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote
Date: Tue, 27 Sep 2022 09:53:04 +1000
Subject: [PATCH 13/15] Minor improvements.

Add some comments, and mark one path as unreachable.
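
The newly-unreachable path is the catch-all arm at the end of
`parse_token_tree_open_delim`: `parse_token_trees_until_close_delim` only
returns when the next token is a close delimiter or EOF, so those are the
only kinds the match that follows it can observe. A self-contained toy
model of that invariant (illustrative names and types, not the compiler's
own):

    enum Kind {
        Close,
        Other,
        Eof,
    }

    // Mirrors `parse_token_trees_until_close_delim`: the loop consumes
    // every `Other` token and only returns on `Close` or `Eof`.
    fn scan_until_close(tokens: &mut impl Iterator<Item = Kind>) -> Kind {
        loop {
            match tokens.next().unwrap_or(Kind::Eof) {
                k @ (Kind::Close | Kind::Eof) => return k,
                Kind::Other => {} // consumed; keep scanning
            }
        }
    }

    fn main() {
        let mut tokens = vec![Kind::Other, Kind::Other, Kind::Close].into_iter();
        // Like the match after the inner loop in `parse_token_tree_open_delim`,
        // this match can only ever see two of the three kinds.
        match scan_until_close(&mut tokens) {
            Kind::Close => println!("saw a close delimiter"),
            Kind::Eof => println!("unclosed delimiter"),
            _ => unreachable!(), // `Other` is ruled out by the loop above
        }
    }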
--- compiler/rustc_parse/src/lexer/tokentrees.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 749a640f92e..3aaf3d4865f 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -8,6 +8,8 @@ use rustc_span::Span; pub(super) struct TokenTreesReader<'a> { string_reader: StringReader<'a>, + /// The "next" token, which has been obtained from the `StringReader` but + /// not yet handled by the `TokenTreesReader`. token: Token, /// Stack of open delimiters and their spans. Used for error message. open_braces: Vec<(Delimiter, Span)>, @@ -112,7 +114,7 @@ impl<'a> TokenTreesReader<'a> { // The span for beginning of the delimited section let pre_span = self.token.span; - // Parse the open delimiter. + // Move past the open delimiter. self.open_braces.push((delim, self.token.span)); self.token = self.string_reader.next_token().0; @@ -152,7 +154,7 @@ impl<'a> TokenTreesReader<'a> { } else { self.matching_delim_spans.push((open_brace, open_brace_span, close_brace_span)); } - // Parse the closing delimiter. + // Move past the closing delimiter. self.token = self.string_reader.next_token().0; } // Incorrect delimiter. @@ -208,7 +210,7 @@ impl<'a> TokenTreesReader<'a> { // and an error emitted then. Thus we don't pop from // self.open_braces here. } - _ => {} + _ => unreachable!(), } TokenTree::Delimited(delim_span, delim, tts) From 7f7e2165b1f1a271c6708f2a54c940bdaa254eb2 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 27 Sep 2022 12:04:03 +1000 Subject: [PATCH 14/15] Rename some variables. These make the delimiter processing clearer. --- compiler/rustc_parse/src/lexer/tokentrees.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 3aaf3d4865f..6f6ab16cb59 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -110,12 +110,12 @@ impl<'a> TokenTreesReader<'a> { err } - fn parse_token_tree_open_delim(&mut self, delim: Delimiter) -> TokenTree { + fn parse_token_tree_open_delim(&mut self, open_delim: Delimiter) -> TokenTree { // The span for beginning of the delimited section let pre_span = self.token.span; // Move past the open delimiter. - self.open_braces.push((delim, self.token.span)); + self.open_braces.push((open_delim, self.token.span)); self.token = self.string_reader.next_token().0; // Parse the token trees within the delimiters. @@ -128,7 +128,7 @@ impl<'a> TokenTreesReader<'a> { match self.token.kind { // Correct delimiter. - token::CloseDelim(d) if d == delim => { + token::CloseDelim(close_delim) if close_delim == open_delim => { let (open_brace, open_brace_span) = self.open_braces.pop().unwrap(); let close_brace_span = self.token.span; @@ -138,12 +138,12 @@ impl<'a> TokenTreesReader<'a> { if !sm.is_multiline(empty_block_span) { // Only track if the block is in the form of `{}`, otherwise it is // likely that it was written on purpose. 
- self.last_delim_empty_block_spans.insert(delim, empty_block_span); + self.last_delim_empty_block_spans.insert(open_delim, empty_block_span); } } //only add braces - if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, delim) { + if let (Delimiter::Brace, Delimiter::Brace) = (open_brace, open_delim) { self.matching_block_spans.push((open_brace_span, close_brace_span)); } @@ -158,7 +158,7 @@ impl<'a> TokenTreesReader<'a> { self.token = self.string_reader.next_token().0; } // Incorrect delimiter. - token::CloseDelim(other) => { + token::CloseDelim(close_delim) => { let mut unclosed_delimiter = None; let mut candidate = None; @@ -176,7 +176,7 @@ impl<'a> TokenTreesReader<'a> { for (brace, brace_span) in &self.open_braces { if let Some(padding) = sm.span_to_margin(*brace_span) { // high likelihood of these two corresponding - if current_padding == padding && brace == &other { + if current_padding == padding && brace == &close_delim { candidate = Some(*brace_span); } } @@ -185,7 +185,7 @@ impl<'a> TokenTreesReader<'a> { let (tok, _) = self.open_braces.pop().unwrap(); self.unmatched_braces.push(UnmatchedBrace { expected_delim: tok, - found_delim: Some(other), + found_delim: Some(close_delim), found_span: self.token.span, unclosed_span: unclosed_delimiter, candidate_span: candidate, @@ -201,7 +201,7 @@ impl<'a> TokenTreesReader<'a> { // fn foo() { // bar(baz( // } // Incorrect delimiter but matches the earlier `{` - if !self.open_braces.iter().any(|&(b, _)| b == other) { + if !self.open_braces.iter().any(|&(b, _)| b == close_delim) { self.token = self.string_reader.next_token().0; } } @@ -213,7 +213,7 @@ impl<'a> TokenTreesReader<'a> { _ => unreachable!(), } - TokenTree::Delimited(delim_span, delim, tts) + TokenTree::Delimited(delim_span, open_delim, tts) } fn close_delim_err(&mut self, delim: Delimiter) -> PErr<'a> { From d0a26acb2ae2d000e516eca92ae8feb08d1f6ea0 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 28 Sep 2022 10:28:36 +1000 Subject: [PATCH 15/15] Address review comments. --- compiler/rustc_lexer/src/lib.rs | 6 ++++-- compiler/rustc_parse/src/lexer/mod.rs | 2 +- compiler/rustc_parse/src/lexer/tokentrees.rs | 14 +++++++------- src/librustdoc/html/highlight.rs | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 18ebed7c70e..c71e6ffe34d 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -23,15 +23,17 @@ // We want to be able to build this crate with a stable compiler, so no // `#![feature]` attributes should be added. -pub mod cursor; +mod cursor; pub mod unescape; #[cfg(test)] mod tests; +pub use crate::cursor::Cursor; + use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; +use crate::cursor::EOF_CHAR; use std::convert::TryFrom; /// Parsed token. 
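
The `rustc_lexer` hunk above swaps a public module for a private one plus a
re-export at the crate root, so downstream code can name the type without
knowing the module layout. A minimal sketch of the pattern (illustrative
crate contents, not the real `rustc_lexer` internals):

    // lib.rs
    mod cursor {
        // Everything else in the module stays private to the crate.
        pub struct Cursor {
            pub pos: usize,
        }
    }

    // External callers write `mycrate::Cursor`; the `cursor` module can be
    // reorganized later without touching their imports.
    pub use crate::cursor::Cursor;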
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 151e80e2b3e..bcd078a8967 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -4,8 +4,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagnosticBuilder, ErrorGuaranteed, PResult}; -use rustc_lexer::cursor::Cursor; use rustc_lexer::unescape::{self, Mode}; +use rustc_lexer::Cursor; use rustc_lexer::{Base, DocStyle, RawStrError}; use rustc_session::lint::builtin::{ RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, diff --git a/compiler/rustc_parse/src/lexer/tokentrees.rs b/compiler/rustc_parse/src/lexer/tokentrees.rs index 6f6ab16cb59..364753154db 100644 --- a/compiler/rustc_parse/src/lexer/tokentrees.rs +++ b/compiler/rustc_parse/src/lexer/tokentrees.rs @@ -53,7 +53,7 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(delim) => return Err(self.close_delim_err(delim)), token::Eof => return Ok(buf.into_token_stream()), - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -66,11 +66,10 @@ impl<'a> TokenTreesReader<'a> { token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)), token::CloseDelim(..) => return buf.into_token_stream(), token::Eof => { - let mut err = self.eof_err(); - err.emit(); + self.eof_err().emit(); return buf.into_token_stream(); } - _ => buf.push(self.parse_token_tree_other()), + _ => buf.push(self.parse_token_tree_non_delim_non_eof()), } } } @@ -245,9 +244,10 @@ impl<'a> TokenTreesReader<'a> { } #[inline] - fn parse_token_tree_other(&mut self) -> TokenTree { - // `spacing` for the returned token is determined by the next token: - // its kind and its `preceded_by_whitespace` status. + fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree { + // `this_spacing` for the returned token refers to whether the token is + // immediately followed by another op token. It is determined by the + // next token: its kind and its `preceded_by_whitespace` status. let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token(); let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() { Spacing::Alone diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index ea65a6334c9..78b98431b19 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -13,7 +13,7 @@ use std::collections::VecDeque; use std::fmt::{Display, Write}; use rustc_data_structures::fx::FxHashMap; -use rustc_lexer::cursor::Cursor; +use rustc_lexer::Cursor; use rustc_lexer::{LiteralKind, TokenKind}; use rustc_span::edition::Edition; use rustc_span::symbol::Symbol;
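
The comment added to `parse_token_tree_non_delim_non_eof` above boils down
to a small decision rule: a token is `Joint` only when the token after it
is an op token that was not preceded by whitespace. A standalone sketch of
that rule (toy function and names, not the compiler's API):

    #[derive(Debug, PartialEq)]
    enum Spacing {
        Alone,
        Joint,
    }

    // `next_is_op` and `next_preceded_by_whitespace` stand in for the kind
    // and whitespace flag of the token returned by the string reader.
    fn this_spacing(next_is_op: bool, next_preceded_by_whitespace: bool) -> Spacing {
        if next_preceded_by_whitespace || !next_is_op { Spacing::Alone } else { Spacing::Joint }
    }

    fn main() {
        assert_eq!(this_spacing(true, false), Spacing::Joint); // `+` in `+=`
        assert_eq!(this_spacing(true, true), Spacing::Alone); // `+` in `+ =`
        assert_eq!(this_spacing(false, false), Spacing::Alone); // `+` in `+x`
    }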