diff --git a/clang/include/clang/AST/CommentLexer.h b/clang/include/clang/AST/CommentLexer.h
index cad68e161c49..52c4eb9e309a 100644
--- a/clang/include/clang/AST/CommentLexer.h
+++ b/clang/include/clang/AST/CommentLexer.h
@@ -281,6 +281,11 @@ private:
   /// command, including command marker.
   SmallString<16> VerbatimBlockEndCommandName;
 
+  /// If true, the commands, html tags, etc will be parsed and reported as
+  /// separate tokens inside the comment body. If false, the comment text will
+  /// be parsed into text and newline tokens.
+  bool ParseCommands;
+
   /// Given a character reference name (e.g., "lt"), return the character that
   /// it stands for (e.g., "<").
   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
@@ -315,12 +320,11 @@ private:
   /// Eat string matching regexp \code \s*\* \endcode.
   void skipLineStartingDecorations();
 
-  /// Lex stuff inside comments. CommentEnd should be set correctly.
+  /// Lex comment text, including commands if ParseCommands is set to true.
   void lexCommentText(Token &T);
 
-  void setupAndLexVerbatimBlock(Token &T,
-                                const char *TextBegin,
-                                char Marker, const CommandInfo *Info);
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                const CommandInfo *Info);
 
   void lexVerbatimBlockFirstLine(Token &T);
 
@@ -343,14 +347,13 @@ private:
 
 public:
   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-        const CommandTraits &Traits,
-        SourceLocation FileLoc,
-        const char *BufferStart, const char *BufferEnd);
+        const CommandTraits &Traits, SourceLocation FileLoc,
+        const char *BufferStart, const char *BufferEnd,
+        bool ParseCommands = true);
 
   void lex(Token &T);
 
-  StringRef getSpelling(const Token &Tok,
-                        const SourceManager &SourceMgr,
+  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                         bool *Invalid = nullptr) const;
 };
 
diff --git a/clang/include/clang/AST/RawCommentList.h b/clang/include/clang/AST/RawCommentList.h
index f561308b88ac..8327efc750fd 100644
--- a/clang/include/clang/AST/RawCommentList.h
+++ b/clang/include/clang/AST/RawCommentList.h
@@ -111,6 +111,30 @@ public:
     return extractBriefText(Context);
   }
 
+  /// Returns sanitized comment text, suitable for presentation in editor UIs.
+  /// E.g. will transform:
+  ///     // This is a long multiline comment.
+  ///     //   Parts of it might be indented.
+  ///     /* The comments styles might be mixed. */
+  ///  into
+  ///     "This is a long multiline comment.\n"
+  ///     "  Parts of it might be indented.\n"
+  ///     "The comments styles might be mixed."
+  /// Also removes leading indentation and sanitizes some common cases:
+  ///     /* This is a first line.
+  ///      *   This is a second line. It is indented.
+  ///      * This is a third line. */
+  /// and
+  ///     /* This is a first line.
+  ///          This is a second line. It is indented.
+  ///     This is a third line. */
+  /// will both turn into:
+  ///     "This is a first line.\n"
+  ///     "  This is a second line. It is indented.\n"
+  ///     "This is a third line."
+  std::string getFormattedText(const SourceManager &SourceMgr,
+                               DiagnosticsEngine &Diags) const;
+
   /// Parse the comment, assuming it is attached to decl \c D.
   comments::FullComment *parse(const ASTContext &Context,
                                const Preprocessor *PP, const Decl *D) const;
diff --git a/clang/lib/AST/CommentLexer.cpp b/clang/lib/AST/CommentLexer.cpp
index 65d0f56f09ab..6ff4d45a9572 100644
--- a/clang/lib/AST/CommentLexer.cpp
+++ b/clang/lib/AST/CommentLexer.cpp
@@ -294,6 +294,39 @@ void Lexer::lexCommentText(Token &T) {
   assert(CommentState == LCS_InsideBCPLComment ||
          CommentState == LCS_InsideCComment);
 
+  // Handles lexing non-command text, i.e. text and newline.
+  auto HandleNonCommandToken = [&]() -> void {
+    assert(State == LS_Normal);
+
+    const char *TokenPtr = BufferPtr;
+    assert(TokenPtr < CommentEnd);
+    switch (*TokenPtr) {
+    case '\n':
+    case '\r':
+      TokenPtr = skipNewline(TokenPtr, CommentEnd);
+      formTokenWithChars(T, TokenPtr, tok::newline);
+
+      if (CommentState == LCS_InsideCComment)
+        skipLineStartingDecorations();
+      return;
+
+    default: {
+      StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
+      size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
+                       .find_first_of(TokStartSymbols);
+      if (End != StringRef::npos)
+        TokenPtr += End;
+      else
+        TokenPtr = CommentEnd;
+      formTextToken(T, TokenPtr);
+      return;
+    }
+    }
+  };
+
+  if (!ParseCommands)
+    return HandleNonCommandToken();
+
   switch (State) {
   case LS_Normal:
     break;
@@ -315,136 +348,116 @@ void Lexer::lexCommentText(Token &T) {
   }
 
   assert(State == LS_Normal);
-
   const char *TokenPtr = BufferPtr;
   assert(TokenPtr < CommentEnd);
-  while (TokenPtr != CommentEnd) {
-    switch(*TokenPtr) {
-      case '\\':
-      case '@': {
-        // Commands that start with a backslash and commands that start with
-        // 'at' have equivalent semantics. But we keep information about the
-        // exact syntax in AST for comments.
-        tok::TokenKind CommandKind =
-            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
-        TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        char C = *TokenPtr;
-        switch (C) {
-        default:
-          break;
-
-        case '\\': case '@': case '&': case '$':
-        case '#': case '<': case '>': case '%':
-        case '\"': case '.': case ':':
-          // This is one of \\ \@ \& \$ etc escape sequences.
-          TokenPtr++;
-          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
-            // This is the \:: escape sequence.
-            TokenPtr++;
-          }
-          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
-          formTokenWithChars(T, TokenPtr, tok::text);
-          T.setText(UnescapedText);
-          return;
-        }
-
-        // Don't make zero-length commands.
-        if (!isCommandNameStartCharacter(*TokenPtr)) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-
-        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
-        unsigned Length = TokenPtr - (BufferPtr + 1);
-
-        // Hardcoded support for lexing LaTeX formula commands
-        // \f$ \f[ \f] \f{ \f} as a single command.
-        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
-          C = *TokenPtr;
-          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
-            TokenPtr++;
-            Length++;
-          }
-        }
-
-        StringRef CommandName(BufferPtr + 1, Length);
-
-        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
-        if (!Info) {
-          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
-            StringRef CorrectedName = Info->Name;
-            SourceLocation Loc = getSourceLocation(BufferPtr);
-            SourceLocation EndLoc = getSourceLocation(TokenPtr);
-            SourceRange FullRange = SourceRange(Loc, EndLoc);
-            SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
-            Diag(Loc, diag::warn_correct_comment_command_name)
-              << FullRange << CommandName << CorrectedName
-              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
-          } else {
-            formTokenWithChars(T, TokenPtr, tok::unknown_command);
-            T.setUnknownCommandName(CommandName);
-            Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
-                << SourceRange(T.getLocation(), T.getEndLocation());
-            return;
-          }
-        }
-        if (Info->IsVerbatimBlockCommand) {
-          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
-          return;
-        }
-        if (Info->IsVerbatimLineCommand) {
-          setupAndLexVerbatimLine(T, TokenPtr, Info);
-          return;
-        }
-        formTokenWithChars(T, TokenPtr, CommandKind);
-        T.setCommandID(Info->getID());
-        return;
-      }
-
-      case '&':
-        lexHTMLCharacterReference(T);
-        return;
-
-      case '<': {
-        TokenPtr++;
-        if (TokenPtr == CommentEnd) {
-          formTextToken(T, TokenPtr);
-          return;
-        }
-        const char C = *TokenPtr;
-        if (isHTMLIdentifierStartingCharacter(C))
-          setupAndLexHTMLStartTag(T);
-        else if (C == '/')
-          setupAndLexHTMLEndTag(T);
-        else
-          formTextToken(T, TokenPtr);
-        return;
-      }
-
-      case '\n':
-      case '\r':
-        TokenPtr = skipNewline(TokenPtr, CommentEnd);
-        formTokenWithChars(T, TokenPtr, tok::newline);
-
-        if (CommentState == LCS_InsideCComment)
-          skipLineStartingDecorations();
-        return;
-
-      default: {
-        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
-                         find_first_of("\n\r\\@&<");
-        if (End != StringRef::npos)
-          TokenPtr += End;
-        else
-          TokenPtr = CommentEnd;
+  switch(*TokenPtr) {
+    case '\\':
+    case '@': {
+      // Commands that start with a backslash and commands that start with
+      // 'at' have equivalent semantics. But we keep information about the
+      // exact syntax in AST for comments.
+      tok::TokenKind CommandKind =
+          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
         formTextToken(T, TokenPtr);
         return;
       }
+      char C = *TokenPtr;
+      switch (C) {
+      default:
+        break;
+
+      case '\\': case '@': case '&': case '$':
+      case '#': case '<': case '>': case '%':
+      case '\"': case '.': case ':':
+        // This is one of \\ \@ \& \$ etc escape sequences.
+        TokenPtr++;
+        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+          // This is the \:: escape sequence.
+          TokenPtr++;
+        }
+        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+        formTokenWithChars(T, TokenPtr, tok::text);
+        T.setText(UnescapedText);
+        return;
+      }
+
+      // Don't make zero-length commands.
+      if (!isCommandNameStartCharacter(*TokenPtr)) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
+
+      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+      unsigned Length = TokenPtr - (BufferPtr + 1);
+
+      // Hardcoded support for lexing LaTeX formula commands
+      // \f$ \f[ \f] \f{ \f} as a single command.
+      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+        C = *TokenPtr;
+        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+          TokenPtr++;
+          Length++;
+        }
+      }
+
+      StringRef CommandName(BufferPtr + 1, Length);
+
+      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
+      if (!Info) {
+        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
+          StringRef CorrectedName = Info->Name;
+          SourceLocation Loc = getSourceLocation(BufferPtr);
+          SourceLocation EndLoc = getSourceLocation(TokenPtr);
+          SourceRange FullRange = SourceRange(Loc, EndLoc);
+          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
+          Diag(Loc, diag::warn_correct_comment_command_name)
+            << FullRange << CommandName << CorrectedName
+            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
+        } else {
+          formTokenWithChars(T, TokenPtr, tok::unknown_command);
+          T.setUnknownCommandName(CommandName);
+          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
+              << SourceRange(T.getLocation(), T.getEndLocation());
+          return;
+        }
+      }
+      if (Info->IsVerbatimBlockCommand) {
+        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
+        return;
+      }
+      if (Info->IsVerbatimLineCommand) {
+        setupAndLexVerbatimLine(T, TokenPtr, Info);
+        return;
+      }
+      formTokenWithChars(T, TokenPtr, CommandKind);
+      T.setCommandID(Info->getID());
+      return;
     }
+
+    case '&':
+      lexHTMLCharacterReference(T);
+      return;
+
+    case '<': {
+      TokenPtr++;
+      if (TokenPtr == CommentEnd) {
+        formTextToken(T, TokenPtr);
+        return;
+      }
+      const char C = *TokenPtr;
+      if (isHTMLIdentifierStartingCharacter(C))
+        setupAndLexHTMLStartTag(T);
+      else if (C == '/')
+        setupAndLexHTMLEndTag(T);
+      else
+        formTextToken(T, TokenPtr);
+      return;
+    }
+
+    default:
+      return HandleNonCommandToken();
   }
 }
 
@@ -727,14 +740,13 @@ void Lexer::lexHTMLEndTag(Token &T) {
 }
 
 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
-             const CommandTraits &Traits,
-             SourceLocation FileLoc,
-             const char *BufferStart, const char *BufferEnd):
-    Allocator(Allocator), Diags(Diags), Traits(Traits),
-    BufferStart(BufferStart), BufferEnd(BufferEnd),
-    FileLoc(FileLoc), BufferPtr(BufferStart),
-    CommentState(LCS_BeforeComment), State(LS_Normal) {
-}
+             const CommandTraits &Traits, SourceLocation FileLoc,
+             const char *BufferStart, const char *BufferEnd,
+             bool ParseCommands)
+    : Allocator(Allocator), Diags(Diags), Traits(Traits),
+      BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
+      BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
+      ParseCommands(ParseCommands) {}
 
 void Lexer::lex(Token &T) {
 again:
diff --git a/clang/lib/AST/RawCommentList.cpp b/clang/lib/AST/RawCommentList.cpp
index d6a640b7dc2d..95da9ed6d238 100644
--- a/clang/lib/AST/RawCommentList.cpp
+++ b/clang/lib/AST/RawCommentList.cpp
@@ -335,3 +335,95 @@ void RawCommentList::addDeserializedComments(ArrayRef<RawComment *> Deserialized
                        BeforeThanCompare<RawComment>(SourceMgr));
   std::swap(Comments, MergedComments);
 }
+
+std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
+                                         DiagnosticsEngine &Diags) const {
+  llvm::StringRef CommentText = getRawText(SourceMgr);
+  if (CommentText.empty())
+    return "";
+
+  llvm::BumpPtrAllocator Allocator;
+  // We do not parse any commands, so CommentOptions are ignored by
+  // comments::Lexer. Therefore, we just use default-constructed options.
+  CommentOptions DefOpts;
+  comments::CommandTraits EmptyTraits(Allocator, DefOpts);
+  comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
+                    CommentText.begin(), CommentText.end(),
+                    /*ParseCommands=*/false);
+
+  std::string Result;
+  // A column number of the first non-whitespace token in the comment text.
+  // We skip whitespace up to this column, but keep the whitespace after this
+  // column. IndentColumn is calculated when lexing the first line and reused
+  // for the rest of lines.
+  unsigned IndentColumn = 0;
+
+  // Processes one line of the comment and adds it to the result.
+  // Handles skipping the indent at the start of the line.
+  // Returns false when eof is reached and true otherwise.
+  auto LexLine = [&](bool IsFirstLine) -> bool {
+    comments::Token Tok;
+    // Lex the first token on the line. We handle it separately, because we
+    // need to fix up its indentation.
+    L.lex(Tok);
+    if (Tok.is(comments::tok::eof))
+      return false;
+    if (Tok.is(comments::tok::newline)) {
+      Result += "\n";
+      return true;
+    }
+    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
+    bool LocInvalid = false;
+    unsigned TokColumn =
+        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
+    assert(!LocInvalid && "getFormattedText for invalid location");
+
+    // Amount of leading whitespace in TokText.
+    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
+    if (WhitespaceLen == StringRef::npos)
+      WhitespaceLen = TokText.size();
+    // Remember the amount of whitespace we skipped in the first line to remove
+    // indent up to that column in the following lines.
+    if (IsFirstLine)
+      IndentColumn = TokColumn + WhitespaceLen;
+
+    // Amount of leading whitespace we actually want to skip.
+    // For the first line we skip all the whitespace.
+    // For the rest of the lines, we skip whitespace up to IndentColumn.
+    unsigned SkipLen =
+        IsFirstLine
+            ? WhitespaceLen
+            : std::min<size_t>(
+                  WhitespaceLen,
+                  IndentColumn > TokColumn ? IndentColumn - TokColumn : 0);
+    llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
+    Result += Trimmed;
+    // Lex all tokens in the rest of the line.
+    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
+      if (Tok.is(comments::tok::newline)) {
+        Result += "\n";
+        return true;
+      }
+      Result += L.getSpelling(Tok, SourceMgr);
+    }
+    // We've reached the end of file token.
+    return false;
+  };
+
+  // Result may be empty here (no text tokens lexed), so check before back().
+  auto DropTrailingNewLines = [](std::string &Str) {
+    while (!Str.empty() && Str.back() == '\n')
+      Str.pop_back();
+  };
+
+  // Process first line separately to remember indent for the following lines.
+  if (!LexLine(/*IsFirstLine=*/true)) {
+    DropTrailingNewLines(Result);
+    return Result;
+  }
+  // Process the rest of the lines.
+  while (LexLine(/*IsFirstLine=*/false))
+    ;
+  DropTrailingNewLines(Result);
+  return Result;
+}
diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt
index 89590c110668..e1c2ad59792e 100644
--- a/clang/unittests/AST/CMakeLists.txt
+++ b/clang/unittests/AST/CMakeLists.txt
@@ -9,6 +9,7 @@ add_clang_unittest(ASTTests
   ASTVectorTest.cpp
   CommentLexer.cpp
   CommentParser.cpp
+  CommentTextTest.cpp
   DataCollectionTest.cpp
   DeclPrinterTest.cpp
   DeclTest.cpp
diff --git a/clang/unittests/AST/CommentTextTest.cpp b/clang/unittests/AST/CommentTextTest.cpp
new file mode 100644
index 000000000000..05003b599ea6
--- /dev/null
+++ b/clang/unittests/AST/CommentTextTest.cpp
@@ -0,0 +1,122 @@
+//===- unittest/AST/CommentTextTest.cpp - Comment text extraction test ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for user-friendly output formatting of comments, i.e.
+// RawComment::getFormattedText().
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/RawCommentList.h"
+#include "clang/Basic/CommentOptions.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/FileSystemOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/VirtualFileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <gtest/gtest.h>
+
+namespace clang {
+
+class CommentTextTest : public ::testing::Test {
+protected:
+  std::string formatComment(llvm::StringRef CommentText) {
+    SourceManagerForFile FileSourceMgr("comment-test.cpp", CommentText);
+    SourceManager& SourceMgr = FileSourceMgr.get();
+
+    auto CommentStartOffset = CommentText.find("/");
+    assert(CommentStartOffset != llvm::StringRef::npos);
+    FileID File = SourceMgr.getMainFileID();
+
+    SourceRange CommentRange(
+        SourceMgr.getLocForStartOfFile(File).getLocWithOffset(
+            CommentStartOffset),
+        SourceMgr.getLocForEndOfFile(File));
+    CommentOptions EmptyOpts;
+    // FIXME: technically, merged that we set here is incorrect, but that
+    // shouldn't matter.
+    RawComment Comment(SourceMgr, CommentRange, EmptyOpts, /*Merged=*/true);
+    DiagnosticsEngine Diags(new DiagnosticIDs, new DiagnosticOptions);
+    return Comment.getFormattedText(SourceMgr, Diags);
+  }
+};
+
+TEST_F(CommentTextTest, FormattedText) {
+  // clang-format off
+  auto ExpectedOutput =
+R"(This function does this and that.
+For example,
+   Runnning it in that case will give you
+   this result.
+That's about it.)";
+  // Two-slash comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+// This function does this and that.
+// For example,
+//    Runnning it in that case will give you
+//    this result.
+// That's about it.)cpp"));
+
+  // Three-slash comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// This function does this and that.
+/// For example,
+///    Runnning it in that case will give you
+///    this result.
+/// That's about it.)cpp"));
+
+  // Block comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/* This function does this and that.
+ * For example,
+ *    Runnning it in that case will give you
+ *    this result.
+ * That's about it.*/)cpp"));
+
+  // Doxygen-style block comments.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/** This function does this and that.
+  * For example,
+  *    Runnning it in that case will give you
+  *    this result.
+  * That's about it.*/)cpp"));
+
+  // Weird indentation.
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+       // This function does this and that.
+  //      For example,
+  //         Runnning it in that case will give you
+        //   this result.
+       // That's about it.)cpp"));
+  // clang-format on
+}
+
+TEST_F(CommentTextTest, KeepsDoxygenControlSeqs) {
+  // clang-format off
+  auto ExpectedOutput =
+R"(\brief This is the brief part of the comment.
+\param a something about a.
+@param b something about b.)";
+
+  EXPECT_EQ(ExpectedOutput, formatComment(
+R"cpp(
+/// \brief This is the brief part of the comment.
+/// \param a something about a.
+/// @param b something about b.)cpp"));
+  // clang-format on
+}
+
+} // namespace clang