transformers/utils/style_doc.py

551 lines
22 KiB
Python

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Style utils for the .rst and the docstrings."""
import argparse
import os
import re
import warnings
from enum import Enum
# Special blocks where the inside should be formatted.
TEXTUAL_BLOCKS = ["note", "warning"]
# List of acceptable characters for titles and sections underline.
TITLE_SPECIAL_CHARS = """= - ` : ' " ~ ^ _ * + # < >""".split(" ")
# Special words for docstrings (s? means the s is optional)
DOC_SPECIAL_WORD = [
"Args?",
"Params?",
"Parameters?",
"Arguments?",
"Examples?",
"Usage",
"Returns?",
"Raises?",
"Attributes?",
]
# Regexes
# Matches any declaration of textual block, like `.. note::`. (ignore case to avoid writing all versions in the list)
_re_textual_blocks = re.compile(r"^\s*\.\.\s+(" + "|".join(TEXTUAL_BLOCKS) + r")\s*::\s*$", re.IGNORECASE)
# Matches list introduction in rst.
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
# Matches the indent in a line.
_re_indent = re.compile(r"^(\s*)\S")
# Matches a table declaration in rst.
_re_table = re.compile(r"(\+-+)+\+\s*$")
# Matches a code block in rst `:: `.
_re_code_block = re.compile(r"^\s*::\s*$")
_re_code_block_explicit = re.compile(r"^\.\.\s+code\-block::")
# Matches any block of the form `.. something::` or `.. something:: bla`.
_re_ignore = re.compile(r"^\s*\.\.\s+(.*?)\s*::\s*\S*\s*$")
# Matches comment introduction in rst.
_re_comment = re.compile(r"\s*\.\.\s*$")
# Matches the special tag to ignore some paragraphs.
_re_doc_ignore = re.compile(r"(\.\.|#)\s*docstyle-ignore")
# Matches the example introduction in docstrings.
_re_example = re.compile(r"::\s*$")
# Matches the parameters introduction in docstrings.
_re_arg_def = re.compile(r"^\s*(Args?|Parameters?|Params|Arguments?|Environment|Attributes?)\s*:\s*$")
# Matches the return introduction in docstrings.
_re_return = re.compile(r"^\s*(Returns?|Raises?|Note)\s*:\s*$")
# Matches any doc special word.
_re_any_doc_special_word = re.compile(r"^\s*(" + "|".join(DOC_SPECIAL_WORD) + r")::?\s*$")
class SpecialBlock(Enum):
NOT_SPECIAL = 0
NO_STYLE = 1
ARG_LIST = 2
def split_text_in_lines(text, max_len, prefix="", min_indent=None):
"""
Split `text` in the biggest lines possible with the constraint of `max_len` using `prefix` on the first line and
then indenting with the same length as `prefix`.
"""
text = re.sub(r"\s+", " ", text)
indent = " " * len(prefix)
if min_indent is not None:
if len(indent) < len(min_indent):
indent = min_indent
if len(prefix) < len(min_indent):
prefix = " " * (len(min_indent) - len(prefix)) + prefix
new_lines = []
words = text.split(" ")
current_line = f"{prefix}{words[0]}"
for word in words[1:]:
try_line = f"{current_line} {word}"
if len(try_line) > max_len:
new_lines.append(current_line)
current_line = f"{indent}{word}"
else:
current_line = try_line
new_lines.append(current_line)
return "\n".join(new_lines)
def get_indent(line):
"""Get the indentation of `line`."""
indent_search = _re_indent.search(line)
return indent_search.groups()[0] if indent_search is not None else ""
class CodeStyler:
"""A generic class to style .rst files."""
def is_no_style_block(self, line):
"""Whether or not `line` introduces a block where styling should be ignore"""
if _re_code_block.search(line) is not None:
return True
if _re_textual_blocks.search(line) is not None:
return False
return _re_ignore.search(line) is not None
def is_comment_or_textual_block(self, line):
"""Whether or not `line` introduces a block where styling should not be ignored (note, warnings...)"""
if _re_comment.search(line):
return True
return _re_textual_blocks.search(line) is not None
def is_special_block(self, line):
"""Whether or not `line` introduces a special block."""
if self.is_no_style_block(line):
self.in_block = SpecialBlock.NO_STYLE
return True
return False
def init_in_block(self, text):
"""
Returns the initial value for `self.in_block`.
Useful for some docstrings beginning inside an argument declaration block (all models).
"""
return SpecialBlock.NOT_SPECIAL
def end_of_special_style(self, line):
"""
Sets back the `in_block` attribute to `NOT_SPECIAL`.
Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
"""
self.in_block = SpecialBlock.NOT_SPECIAL
def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
"""
Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
is passed.
"""
if len(paragraph) == 0:
return ""
if no_style or self.in_block == SpecialBlock.NO_STYLE:
return "\n".join(paragraph)
if _re_list.search(paragraph[0]) is not None:
# Great, we're in a list. So we need to split our paragraphs in smaller parts, one for each item.
result = ""
remainder = ""
prefix = _re_list.search(paragraph[0]).groups()[0]
prefix_indent = get_indent(paragraph[0])
current_item = [paragraph[0][len(prefix) :]]
for i, line in enumerate(paragraph[1:]):
new_item_search = _re_list.search(line)
indent = get_indent(line)
if len(indent) < len(prefix_indent) or (len(indent) == len(prefix_indent) and new_item_search is None):
# There might not be an empty line after the list, formatting the remainder recursively.
remainder = "\n" + self.style_paragraph(
paragraph[i + 1 :], max_len, no_style=no_style, min_indent=min_indent
)
break
elif new_item_search is not None:
text = " ".join([l.strip() for l in current_item])
result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent) + "\n"
prefix = new_item_search.groups()[0]
prefix_indent = indent
current_item = [line[len(prefix) :]]
else:
current_item.append(line)
# Treat the last item
text = " ".join([l.strip() for l in current_item])
result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent)
# Add the potential remainder
return result + remainder
if len(paragraph) > 1 and self.is_comment_or_textual_block(paragraph[0]):
# Comments/notes in rst should be restyled with indentation, ignoring the first line.
indent = get_indent(paragraph[1])
text = " ".join([l.strip() for l in paragraph[1:]])
return paragraph[0] + "\n" + split_text_in_lines(text, max_len, indent, min_indent=min_indent)
if self.in_block == SpecialBlock.ARG_LIST:
# Arg lists are special: we need to ignore the lines that are at the first indentation level beneath the
# Args/Parameters (parameter description), then we can style the indentation level beneath.
result = ""
# The args/parameters could be in that paragraph and should be ignored
if _re_arg_def.search(paragraph[0]) is not None:
if len(paragraph) == 1:
return paragraph[0]
result += paragraph[0] + "\n"
paragraph = paragraph[1:]
if self.current_indent is None:
self.current_indent = get_indent(paragraph[1])
current_item = []
for line in paragraph:
if get_indent(line) == self.current_indent:
if len(current_item) > 0:
item_indent = get_indent(current_item[0])
text = " ".join([l.strip() for l in current_item])
result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
result += line + "\n"
current_item = []
else:
current_item.append(line)
if len(current_item) > 0:
item_indent = get_indent(current_item[0])
text = " ".join([l.strip() for l in current_item])
result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
return result[:-1]
indent = get_indent(paragraph[0])
text = " ".join([l.strip() for l in paragraph])
return split_text_in_lines(text, max_len, indent, min_indent=min_indent)
def style(self, text, max_len=119, min_indent=None):
"""Style `text` to `max_len`."""
new_lines = []
paragraph = []
self.current_indent = ""
self.previous_indent = None
# If one of those is True, the paragraph should not be touched (code samples, lists...)
no_style = False
no_style_next = False
self.in_block = self.init_in_block(text)
# If this is True, we force-break a paragraph, even if there is no new empty line.
break_paragraph = False
lines = text.split("\n")
last_line = None
for line in lines:
# New paragraph
line_is_empty = len(line.strip()) == 0
list_begins = (
_re_list.search(line) is not None
and last_line is not None
and len(get_indent(line)) > len(get_indent(last_line))
)
if line_is_empty or break_paragraph or list_begins:
if len(paragraph) > 0:
if self.in_block != SpecialBlock.NOT_SPECIAL:
indent = get_indent(paragraph[0])
# Are we still in a no-style block?
if self.current_indent is None:
# If current_indent is None, we haven't begun the interior of the block so the answer is
# yes, unless we have an indent of 0 in which case the special block took one line only.
if len(indent) == 0:
self.in_block = SpecialBlock.NOT_SPECIAL
else:
self.current_indent = indent
elif not indent.startswith(self.current_indent):
# If not, we are leaving the block when we unindent.
self.end_of_special_style(paragraph[0])
if self.is_special_block(paragraph[0]):
# Maybe we are starting a special block.
if len(paragraph) > 1:
# If we have the interior of the block in the paragraph, we grab the indent.
self.current_indent = get_indent(paragraph[1])
else:
# We will determine the indent with the next paragraph
self.current_indent = None
styled_paragraph = self.style_paragraph(
paragraph, max_len, no_style=no_style, min_indent=min_indent
)
new_lines.append(styled_paragraph + "\n")
else:
new_lines.append("")
paragraph = []
no_style = no_style_next
no_style_next = False
last_line = None
if (not break_paragraph and not list_begins) or line_is_empty:
break_paragraph = False
continue
break_paragraph = False
# Title and section lines should go to the max + add a new paragraph.
if (
len(set(line)) == 1
and line[0] in TITLE_SPECIAL_CHARS
and last_line is not None
and len(line) >= len(last_line)
):
line = line[0] * max_len
break_paragraph = True
# proper doc comment indicates the next paragraph should be no-style.
if _re_doc_ignore.search(line) is not None:
no_style_next = True
# Table are in just one paragraph and should be no-style.
if _re_table.search(line) is not None:
no_style = True
paragraph.append(line)
last_line = line
# Just have to treat the last paragraph. It could still be in a no-style block (or not)
if len(paragraph) > 0:
# Are we still in a special block
# (if current_indent is None, we are but no need to set it since we are the end.)
if self.in_block != SpecialBlock.NO_STYLE and self.current_indent is not None:
indent = get_indent(paragraph[0])
if not indent.startswith(self.current_indent):
self.in_block = SpecialBlock.NOT_SPECIAL
_ = self.is_special_block(paragraph[0])
new_lines.append(self.style_paragraph(paragraph, max_len, no_style=no_style, min_indent=min_indent) + "\n")
return "\n".join(new_lines)
class DocstringStyler(CodeStyler):
"""Class to style docstrings that take the main method from `CodeStyler`."""
def is_no_style_block(self, line):
if _re_textual_blocks.search(line) is not None:
return False
if _re_example.search(line) is not None:
return True
return _re_code_block.search(line) is not None
def is_comment_or_textual_block(self, line):
if _re_return.search(line) is not None:
self.in_block = SpecialBlock.NOT_SPECIAL
return True
return super().is_comment_or_textual_block(line)
def is_special_block(self, line):
if self.is_no_style_block(line):
if self.previous_indent is None and self.in_block == SpecialBlock.ARG_LIST:
self.previous_indent = self.current_indent
self.in_block = SpecialBlock.NO_STYLE
return True
if _re_arg_def.search(line) is not None:
self.in_block = SpecialBlock.ARG_LIST
return True
return False
def end_of_special_style(self, line):
if self.previous_indent is not None and line.startswith(self.previous_indent):
self.in_block = SpecialBlock.ARG_LIST
self.current_indent = self.previous_indent
else:
self.in_block = SpecialBlock.NOT_SPECIAL
self.previous_indent = None
def init_in_block(self, text):
lines = text.split("\n")
while len(lines) > 0 and len(lines[0]) == 0:
lines = lines[1:]
if len(lines) == 0:
return SpecialBlock.NOT_SPECIAL
if re.search(r":\s*$", lines[0]):
indent = get_indent(lines[0])
if (
len(lines) == 1
or len(get_indent(lines[1])) > len(indent)
or (len(get_indent(lines[1])) == len(indent) and re.search(r":\s*$", lines[1]))
):
self.current_indent = indent
return SpecialBlock.ARG_LIST
return SpecialBlock.NOT_SPECIAL
rst_styler = CodeStyler()
doc_styler = DocstringStyler()
def _reindent_code_blocks(text):
"""Checks indent in code blocks is of four"""
lines = text.split("\n")
idx = 0
while idx < len(lines):
# Detect if the line is the start of a new code-block.
if _re_code_block.search(lines[idx]) is not None or _re_code_block_explicit.search(lines[idx]) is not None:
while len(get_indent(lines[idx])) == 0:
idx += 1
indent = len(get_indent(lines[idx]))
should_continue = True
while should_continue:
if len(lines[idx]) > 0 and indent < 4:
lines[idx] = " " * 4 + lines[idx][indent:]
idx += 1
should_continue = (idx < len(lines)) and (len(lines[idx]) == 0 or len(get_indent(lines[idx])) > 0)
else:
idx += 1
return "\n".join(lines)
def _add_new_lines_before_list(text):
"""Add a new empty line before a list begins."""
lines = text.split("\n")
new_lines = []
in_list = False
for idx, line in enumerate(lines):
# Detect if the line is the start of a new list.
if _re_list.search(line) is not None and not in_list:
current_indent = get_indent(line)
in_list = True
# If the line before is non empty, add an extra new line.
if idx > 0 and len(lines[idx - 1]) != 0:
new_lines.append("")
# Detect if we're out of the current list.
if in_list and not line.startswith(current_indent) and _re_list.search(line) is None:
in_list = False
new_lines.append(line)
return "\n".join(new_lines)
def _add_new_lines_before_doc_special_words(text):
lines = text.split("\n")
new_lines = []
for idx, line in enumerate(lines):
# Detect if the line is the start of a new list.
if _re_any_doc_special_word.search(line) is not None:
# If the line before is non empty, add an extra new line.
if idx > 0 and len(lines[idx - 1]) != 0:
new_lines.append("")
new_lines.append(line)
return "\n".join(new_lines)
def style_rst_file(doc_file, max_len=119, check_only=False):
"""Style one rst file `doc_file` to `max_len`."""
with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
doc = f.read()
# Make sure code blocks are indented at 4
clean_doc = _reindent_code_blocks(doc)
# Add missing new lines before lists
clean_doc = _add_new_lines_before_list(clean_doc)
# Style
clean_doc = rst_styler.style(clean_doc, max_len=max_len)
diff = clean_doc != doc
if not check_only and diff:
print(f"Overwriting content of {doc_file}.")
with open(doc_file, "w", encoding="utf-8", newline="\n") as f:
f.write(clean_doc)
return diff
def style_docstring(docstring, max_len=119):
"""Style `docstring` to `max_len`."""
# One-line docstring that are not too long are left as is.
if len(docstring) < max_len and "\n" not in docstring:
return docstring
# Grab the indent from the last line
last_line = docstring.split("\n")[-1]
# Is it empty except for the last triple-quotes (not-included in `docstring`)?
indent_search = re.search(r"^(\s*)$", last_line)
if indent_search is not None:
indent = indent_search.groups()[0]
if len(indent) > 0:
docstring = docstring[: -len(indent)]
# Or are the triple quotes next to text (we will fix that).
else:
indent_search = _re_indent.search(last_line)
indent = indent_search.groups()[0] if indent_search is not None else ""
# Add missing new lines before Args/Returns etc.
docstring = _add_new_lines_before_doc_special_words(docstring)
# Add missing new lines before lists
docstring = _add_new_lines_before_list(docstring)
# Style
styled_doc = doc_styler.style(docstring, max_len=max_len, min_indent=indent)
# Add new lines if necessary
if not styled_doc.startswith("\n"):
styled_doc = "\n" + styled_doc
if not styled_doc.endswith("\n"):
styled_doc += "\n"
return styled_doc + indent
def style_file_docstrings(code_file, max_len=119, check_only=False):
"""Style all docstrings in `code_file` to `max_len`."""
with open(code_file, "r", encoding="utf-8", newline="\n") as f:
code = f.read()
# fmt: off
splits = code.split('\"\"\"')
splits = [
(s if i % 2 == 0 or _re_doc_ignore.search(splits[i - 1]) is not None else style_docstring(s, max_len=max_len))
for i, s in enumerate(splits)
]
clean_code = '\"\"\"'.join(splits)
# fmt: on
diff = clean_code != code
if not check_only and diff:
print(f"Overwriting content of {code_file}.")
with open(code_file, "w", encoding="utf-8", newline="\n") as f:
f.write(clean_code)
return diff
def style_doc_files(*files, max_len=119, check_only=False):
"""
Style all `files` to `max_len` and fixes mistakes if not `check_only`, otherwise raises an error if styling should
be done.
"""
changed = []
for file in files:
# Treat folders
if os.path.isdir(file):
files = [os.path.join(file, f) for f in os.listdir(file)]
files = [f for f in files if os.path.isdir(f) or f.endswith(".rst") or f.endswith(".py")]
changed += style_doc_files(*files, max_len=max_len, check_only=check_only)
# Treat rst
elif file.endswith(".rst"):
if style_rst_file(file, max_len=max_len, check_only=check_only):
changed.append(file)
# Treat python files
elif file.endswith(".py"):
if style_file_docstrings(file, max_len=max_len, check_only=check_only):
changed.append(file)
else:
warnings.warn(f"Ignoring {file} because it's not a py or an rst file or a folder.")
return changed
def main(*files, max_len=119, check_only=False):
changed = style_doc_files(*files, max_len=max_len, check_only=check_only)
if check_only and len(changed) > 0:
raise ValueError(f"{len(changed)} files should be restyled!")
elif len(changed) > 0:
print(f"Cleaned {len(changed)} files!")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.")
parser.add_argument("--max_len", type=int, help="The maximum length of lines.")
parser.add_argument("--check_only", action="store_true", help="Whether to only check and not fix styling issues.")
args = parser.parse_args()
main(*args.files, max_len=args.max_len, check_only=args.check_only)