551 lines
22 KiB
Python
551 lines
22 KiB
Python
# coding=utf-8
|
|
# Copyright 2020 The HuggingFace Inc. team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Style utils for the .rst and the docstrings."""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import warnings
|
|
from enum import Enum
|
|
|
|
|
|
# Directive names (as in `.. note::`) whose content is regular text and must be re-wrapped.
TEXTUAL_BLOCKS = ["note", "warning"]

# Characters that may be repeated to underline a title or a section.
TITLE_SPECIAL_CHARS = ["=", "-", "`", ":", "'", '"', "~", "^", "_", "*", "+", "#", "<", ">"]

# Section keywords of docstrings, written as regex fragments (`s?` makes the trailing s optional).
DOC_SPECIAL_WORD = [
    "Args?", "Params?", "Parameters?", "Arguments?",
    "Examples?", "Usage",
    "Returns?", "Raises?",
    "Attributes?",
]
|
|
|
|
# Regexes
# Matches the declaration of a textual block, like `.. note::`, alone on its line. Case is ignored so that the
# names in TEXTUAL_BLOCKS do not have to be listed in every capitalization.
_re_textual_blocks = re.compile(r"^\s*\.\.\s+(" + "|".join(TEXTUAL_BLOCKS) + r")\s*::\s*$", re.IGNORECASE)
# Matches the introduction of a list item in rst (`- `, `* ` or `1. `) and captures it together with its indent.
_re_list = re.compile(r"^(\s*-\s+|\s*\*\s+|\s*\d+\.\s+)")
# Captures the leading whitespace of a line that contains at least one non-whitespace character.
_re_indent = re.compile(r"^(\s*)\S")
# Matches a table border line in rst, such as `+----+----+`.
_re_table = re.compile(r"(\+-+)+\+\s*$")
# Matches a line containing only `::` (implicit code block in rst).
_re_code_block = re.compile(r"^\s*::\s*$")
# Matches an explicit `.. code-block::` directive starting in the first column.
_re_code_block_explicit = re.compile(r"^\.\.\s+code\-block::")
# Matches any block of the form `.. something::` or `.. something:: bla`.
_re_ignore = re.compile(r"^\s*\.\.\s+(.*?)\s*::\s*\S*\s*$")
# Matches a comment introduction in rst (a line ending with a bare `..`).
_re_comment = re.compile(r"\s*\.\.\s*$")
# Matches the special tag (`.. docstyle-ignore` or `# docstyle-ignore`) used to leave some paragraphs untouched.
_re_doc_ignore = re.compile(r"(\.\.|#)\s*docstyle-ignore")
# Matches the introduction of an example in docstrings (a line ending with `::`).
_re_example = re.compile(r"::\s*$")
# Matches the introduction of the parameters section in docstrings.
# NOTE(review): this lists `Params` while DOC_SPECIAL_WORD lists `Params?` -- confirm whether `Param:` should match.
_re_arg_def = re.compile(r"^\s*(Args?|Parameters?|Params|Arguments?|Environment|Attributes?)\s*:\s*$")
# Matches the introduction of the return/raise/note sections in docstrings.
_re_return = re.compile(r"^\s*(Returns?|Raises?|Note)\s*:\s*$")
# Matches a line made of any doc special word followed by one or two colons.
_re_any_doc_special_word = re.compile(r"^\s*(" + "|".join(DOC_SPECIAL_WORD) + r")::?\s*$")
|
|
|
|
|
|
class SpecialBlock(Enum):
    """Kind of block the stylers are currently in (stored in their `in_block` attribute)."""

    # Regular text.
    NOT_SPECIAL = 0
    # Block whose paragraphs are returned unchanged by `style_paragraph`.
    NO_STYLE = 1
    # Argument list of a docstring, which gets its own handling in `style_paragraph`.
    ARG_LIST = 2
|
|
|
|
|
|
def split_text_in_lines(text, max_len, prefix="", min_indent=None):
    """
    Re-wrap `text` greedily so that lines stay within `max_len` characters whenever a word boundary allows it.

    Every run of whitespace in `text` is first collapsed to a single space. The first line starts with `prefix`;
    the following lines start with as many spaces as `prefix` is long. When `min_indent` is given and is longer than
    `prefix`, continuation lines start with `min_indent` instead and `prefix` is left-padded with spaces up to the
    length of `min_indent`.

    Returns the wrapped lines joined with newlines. A single word longer than `max_len` is kept whole on its own
    line.
    """
    words = re.sub(r"\s+", " ", text).split(" ")

    continuation = " " * len(prefix)
    if min_indent is not None and len(prefix) < len(min_indent):
        continuation = min_indent
        prefix = prefix.rjust(len(min_indent))

    first_word, *other_words = words
    wrapped = []
    line = prefix + first_word
    for word in other_words:
        candidate = line + " " + word
        if len(candidate) <= max_len:
            line = candidate
            continue
        # The word does not fit: close the line and start a new one at the continuation indent.
        wrapped.append(line)
        line = continuation + word
    wrapped.append(line)
    return "\n".join(wrapped)
|
|
|
|
|
|
def get_indent(line):
    """Return the leading whitespace of `line`, or an empty string when `line` has no non-whitespace character."""
    match = _re_indent.search(line)
    if match is None:
        return ""
    return match.group(1)
|
|
|
|
|
|
class CodeStyler:
    """
    A generic class to style .rst files.

    The styler is stateful: `style` walks the text paragraph by paragraph and records in the attributes below what
    kind of block the current paragraph belongs to. `style` resets all of them on every call; the class-level values
    only exist so the other methods can be called on a fresh instance.
    """

    # Kind of block (a `SpecialBlock`) the paragraph being processed belongs to.
    in_block = SpecialBlock.NOT_SPECIAL
    # Indentation of the interior of the current special block; `None` while that interior has not been seen yet.
    current_indent = ""
    # Only reset by this class; `DocstringStyler` stores the argument-list indent here while inside a nested
    # no-style block.
    previous_indent = None

    def is_no_style_block(self, line):
        """Whether or not `line` introduces a block where styling should be ignored."""
        if _re_code_block.search(line) is not None:
            return True
        if _re_textual_blocks.search(line) is not None:
            return False
        return _re_ignore.search(line) is not None

    def is_comment_or_textual_block(self, line):
        """Whether or not `line` introduces a block where styling should not be ignored (note, warnings...)"""
        if _re_comment.search(line):
            return True
        return _re_textual_blocks.search(line) is not None

    def is_special_block(self, line):
        """Whether or not `line` introduces a special block (updates `self.in_block` when it does)."""
        if self.is_no_style_block(line):
            self.in_block = SpecialBlock.NO_STYLE
            return True
        return False

    def init_in_block(self, text):
        """
        Returns the initial value for `self.in_block`.

        Useful for some docstrings beginning inside an argument declaration block (all models).
        """
        return SpecialBlock.NOT_SPECIAL

    def end_of_special_style(self, line):
        """
        Sets back the `in_block` attribute to `NOT_SPECIAL`.

        Useful for some docstrings where we may have to go back to `ARG_LIST` instead.
        """
        self.in_block = SpecialBlock.NOT_SPECIAL

    def style_paragraph(self, paragraph, max_len, no_style=False, min_indent=None):
        """
        Style `paragraph` (a list of lines) by making sure no line goes over `max_len`, except if the `no_style` flag
        is passed.

        The paragraph is returned as a single string. It is left untouched when `no_style` is set or when the styler
        is inside a `NO_STYLE` block. Lists, comments/textual blocks and argument lists each get a dedicated
        treatment; anything else is re-wrapped at the indentation of its first line. `min_indent` is forwarded to
        `split_text_in_lines`.
        """
        if len(paragraph) == 0:
            return ""
        if no_style or self.in_block == SpecialBlock.NO_STYLE:
            return "\n".join(paragraph)
        if _re_list.search(paragraph[0]) is not None:
            # Great, we're in a list. So we need to split our paragraphs in smaller parts, one for each item.
            result = ""
            remainder = ""
            prefix = _re_list.search(paragraph[0]).groups()[0]
            prefix_indent = get_indent(paragraph[0])
            current_item = [paragraph[0][len(prefix) :]]
            for i, line in enumerate(paragraph[1:]):
                new_item_search = _re_list.search(line)
                indent = get_indent(line)
                if len(indent) < len(prefix_indent) or (len(indent) == len(prefix_indent) and new_item_search is None):
                    # There might not be an empty line after the list, formatting the remainder recursively.
                    # `i` indexes `paragraph[1:]`, so `paragraph[i + 1]` is the current line.
                    remainder = "\n" + self.style_paragraph(
                        paragraph[i + 1 :], max_len, no_style=no_style, min_indent=min_indent
                    )
                    break
                elif new_item_search is not None:
                    # A new item begins: flush the previous one and start collecting the new one.
                    text = " ".join([item_line.strip() for item_line in current_item])
                    result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent) + "\n"
                    prefix = new_item_search.groups()[0]
                    prefix_indent = indent
                    current_item = [line[len(prefix) :]]
                else:
                    current_item.append(line)
            # Treat the last item
            text = " ".join([item_line.strip() for item_line in current_item])
            result += split_text_in_lines(text, max_len, prefix, min_indent=min_indent)
            # Add the potential remainder
            return result + remainder

        if len(paragraph) > 1 and self.is_comment_or_textual_block(paragraph[0]):
            # Comments/notes in rst should be restyled with indentation, ignoring the first line.
            indent = get_indent(paragraph[1])
            text = " ".join([block_line.strip() for block_line in paragraph[1:]])
            return paragraph[0] + "\n" + split_text_in_lines(text, max_len, indent, min_indent=min_indent)

        if self.in_block == SpecialBlock.ARG_LIST:
            # Arg lists are special: we need to ignore the lines that are at the first indentation level beneath the
            # Args/Parameters (parameter description), then we can style the indentation level beneath.
            result = ""
            # The args/parameters could be in that paragraph and should be ignored
            if _re_arg_def.search(paragraph[0]) is not None:
                if len(paragraph) == 1:
                    return paragraph[0]
                result += paragraph[0] + "\n"
                paragraph = paragraph[1:]

            if self.current_indent is None:
                # With the header removed (or absent), the first line is the one sitting at the first indentation
                # level beneath it. `paragraph` always has at least one line here, whereas indexing the second
                # line failed on single-line paragraphs.
                self.current_indent = get_indent(paragraph[0])

            current_item = []
            for line in paragraph:
                if get_indent(line) == self.current_indent:
                    # Parameter declaration: flush the pending description and keep the declaration as is.
                    if len(current_item) > 0:
                        item_indent = get_indent(current_item[0])
                        text = " ".join([item_line.strip() for item_line in current_item])
                        result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
                    result += line + "\n"
                    current_item = []
                else:
                    current_item.append(line)
            if len(current_item) > 0:
                item_indent = get_indent(current_item[0])
                text = " ".join([item_line.strip() for item_line in current_item])
                result += split_text_in_lines(text, max_len, item_indent, min_indent=min_indent) + "\n"
            # Drop the trailing newline added after the last piece.
            return result[:-1]

        indent = get_indent(paragraph[0])
        text = " ".join([text_line.strip() for text_line in paragraph])
        return split_text_in_lines(text, max_len, indent, min_indent=min_indent)

    def style(self, text, max_len=119, min_indent=None):
        """
        Style `text` to `max_len`.

        The text is cut into paragraphs at empty lines, after title underlines and where a list begins at a deeper
        indentation than the previous line. Each paragraph is passed to `style_paragraph` and the results are joined
        back into one string. `min_indent` is forwarded to `style_paragraph`.
        """
        new_lines = []
        paragraph = []
        self.current_indent = ""
        self.previous_indent = None
        # If one of those is True, the paragraph should not be touched (code samples, lists...)
        no_style = False
        no_style_next = False
        self.in_block = self.init_in_block(text)
        # If this is True, we force-break a paragraph, even if there is no new empty line.
        break_paragraph = False

        lines = text.split("\n")
        last_line = None
        for line in lines:
            # New paragraph
            line_is_empty = len(line.strip()) == 0
            list_begins = (
                _re_list.search(line) is not None
                and last_line is not None
                and len(get_indent(line)) > len(get_indent(last_line))
            )
            if line_is_empty or break_paragraph or list_begins:
                if len(paragraph) > 0:
                    if self.in_block != SpecialBlock.NOT_SPECIAL:
                        indent = get_indent(paragraph[0])
                        # Are we still in a no-style block?
                        if self.current_indent is None:
                            # If current_indent is None, we haven't begun the interior of the block so the answer is
                            # yes, unless we have an indent of 0 in which case the special block took one line only.
                            if len(indent) == 0:
                                self.in_block = SpecialBlock.NOT_SPECIAL
                            else:
                                self.current_indent = indent
                        elif not indent.startswith(self.current_indent):
                            # If not, we are leaving the block when we unindent.
                            self.end_of_special_style(paragraph[0])

                    if self.is_special_block(paragraph[0]):
                        # Maybe we are starting a special block.
                        if len(paragraph) > 1:
                            # If we have the interior of the block in the paragraph, we grab the indent.
                            self.current_indent = get_indent(paragraph[1])
                        else:
                            # We will determine the indent with the next paragraph
                            self.current_indent = None
                    styled_paragraph = self.style_paragraph(
                        paragraph, max_len, no_style=no_style, min_indent=min_indent
                    )
                    new_lines.append(styled_paragraph + "\n")
                else:
                    new_lines.append("")

                paragraph = []
                no_style = no_style_next
                no_style_next = False
                last_line = None
                if (not break_paragraph and not list_begins) or line_is_empty:
                    break_paragraph = False
                    continue
                # The break was forced by a title or a list: the current line opens the next paragraph.
                break_paragraph = False

            # Title and section lines should go to the max + add a new paragraph.
            if (
                len(set(line)) == 1
                and line[0] in TITLE_SPECIAL_CHARS
                and last_line is not None
                and len(line) >= len(last_line)
            ):
                line = line[0] * max_len
                break_paragraph = True
            # proper doc comment indicates the next paragraph should be no-style.
            if _re_doc_ignore.search(line) is not None:
                no_style_next = True
            # Table are in just one paragraph and should be no-style.
            if _re_table.search(line) is not None:
                no_style = True
            paragraph.append(line)
            last_line = line

        # Just have to treat the last paragraph. It could still be in a no-style block (or not)
        if len(paragraph) > 0:
            # Are we still in a special block
            # (if current_indent is None, we are but no need to set it since we are the end.)
            # NOTE(review): the in-loop check above tests `!= SpecialBlock.NOT_SPECIAL` whereas this one tests
            # `!= SpecialBlock.NO_STYLE` -- confirm this difference is intended.
            if self.in_block != SpecialBlock.NO_STYLE and self.current_indent is not None:
                indent = get_indent(paragraph[0])
                if not indent.startswith(self.current_indent):
                    self.in_block = SpecialBlock.NOT_SPECIAL
            # Only called for its effect on `self.in_block`.
            _ = self.is_special_block(paragraph[0])
            new_lines.append(self.style_paragraph(paragraph, max_len, no_style=no_style, min_indent=min_indent) + "\n")
        return "\n".join(new_lines)
|
|
|
|
|
|
class DocstringStyler(CodeStyler):
    """Class to style docstrings that take the main method from `CodeStyler`."""

    def is_no_style_block(self, line):
        """Whether `line` opens a block to leave untouched: an example (`::` at the end of the line) or a code block."""
        if _re_textual_blocks.search(line) is not None:
            return False
        return _re_example.search(line) is not None or _re_code_block.search(line) is not None

    def is_comment_or_textual_block(self, line):
        """
        Whether `line` opens a block whose content is re-wrapped under its first line.

        A return/raise/note introduction counts as one and also sets `in_block` back to `NOT_SPECIAL`.
        """
        if _re_return.search(line) is None:
            return super().is_comment_or_textual_block(line)
        self.in_block = SpecialBlock.NOT_SPECIAL
        return True

    def is_special_block(self, line):
        """Whether `line` opens a no-style block or an argument list; `in_block` is updated accordingly."""
        if self.is_no_style_block(line):
            coming_from_arg_list = self.in_block == SpecialBlock.ARG_LIST
            if coming_from_arg_list and self.previous_indent is None:
                # Remember the indent of the argument list so that it can be restored after the block.
                self.previous_indent = self.current_indent
            self.in_block = SpecialBlock.NO_STYLE
            return True
        if _re_arg_def.search(line) is None:
            return False
        self.in_block = SpecialBlock.ARG_LIST
        return True

    def end_of_special_style(self, line):
        """
        Leave the current special block.

        Goes back to `ARG_LIST` when an argument-list indent was saved and `line` starts with it, to `NOT_SPECIAL`
        otherwise. The saved indent is cleared in both cases.
        """
        saved_indent = self.previous_indent
        self.previous_indent = None
        if saved_indent is not None and line.startswith(saved_indent):
            self.in_block = SpecialBlock.ARG_LIST
            self.current_indent = saved_indent
        else:
            self.in_block = SpecialBlock.NOT_SPECIAL

    def init_in_block(self, text):
        """
        Return the initial value of `in_block` for `text`.

        `ARG_LIST` is returned (and `current_indent` set to the indent of the first non-empty line) when that line
        ends with a colon and is either the only line, or followed by a more indented line, or followed by a line at
        the same indent that also ends with a colon. `NOT_SPECIAL` is returned otherwise.
        """
        lines = text.split("\n")
        start = 0
        while start < len(lines) and len(lines[start]) == 0:
            start += 1
        lines = lines[start:]
        if len(lines) == 0:
            return SpecialBlock.NOT_SPECIAL

        first_line = lines[0]
        if re.search(r":\s*$", first_line) is None:
            return SpecialBlock.NOT_SPECIAL

        indent = get_indent(first_line)
        if len(lines) > 1:
            second_line = lines[1]
            second_indent_length = len(get_indent(second_line))
            is_deeper = second_indent_length > len(indent)
            is_sibling_declaration = second_indent_length == len(indent) and re.search(r":\s*$", second_line)
            if not (is_deeper or is_sibling_declaration):
                return SpecialBlock.NOT_SPECIAL

        self.current_indent = indent
        return SpecialBlock.ARG_LIST
|
|
|
|
|
|
# Shared styler instances: `rst_styler` is used by `style_rst_file`, `doc_styler` by `style_docstring`.
rst_styler = CodeStyler()
doc_styler = DocstringStyler()
|
|
|
|
|
|
def _reindent_code_blocks(text):
    """
    Make sure the content of code blocks is indented by at least four characters.

    A code block starts at a line holding only `::` or at a `.. code-block::` directive. Starting from the first
    indented line found at or after that marker, lines are processed until a non-empty line without indentation is
    met. When that first indented line has fewer than four characters of indentation, that many leading characters
    of every non-empty processed line are replaced by four spaces.

    Returns the resulting text. A marker that is not followed by any indented line is left as is.
    """
    lines = text.split("\n")
    idx = 0
    while idx < len(lines):
        # Detect if the line is the start of a new code-block.
        starts_code_block = (
            _re_code_block.search(lines[idx]) is not None or _re_code_block_explicit.search(lines[idx]) is not None
        )
        if not starts_code_block:
            idx += 1
            continue

        # Move to the first indented line, without running past the end of the text.
        while idx < len(lines) and len(get_indent(lines[idx])) == 0:
            idx += 1
        if idx >= len(lines):
            # The marker was the last thing in the text (no indented content follows): nothing to reindent.
            break

        indent = len(get_indent(lines[idx]))
        should_continue = True
        while should_continue:
            if len(lines[idx]) > 0 and indent < 4:
                lines[idx] = " " * 4 + lines[idx][indent:]
            idx += 1
            should_continue = (idx < len(lines)) and (len(lines[idx]) == 0 or len(get_indent(lines[idx])) > 0)

    return "\n".join(lines)
|
|
|
|
|
|
def _add_new_lines_before_list(text):
    """Insert an empty line before the first item of a list when the line just above it is not empty."""
    lines = text.split("\n")
    output = []
    inside_list = False
    for position, line in enumerate(lines):
        is_list_item = _re_list.search(line) is not None
        if is_list_item and not inside_list:
            # First item of a new list: remember its indent to detect where the list stops.
            current_indent = get_indent(line)
            inside_list = True
            if position > 0 and len(lines[position - 1]) != 0:
                output.append("")
        if inside_list and not is_list_item and not line.startswith(current_indent):
            # A non-item line that is less indented than the list: the list is over.
            inside_list = False
        output.append(line)
    return "\n".join(output)
|
|
|
|
|
|
def _add_new_lines_before_doc_special_words(text):
    """Insert an empty line before each doc special word line (Args:, Returns:...) that follows a non-empty line."""
    lines = text.split("\n")
    output = []
    for position, line in enumerate(lines):
        is_special_word = _re_any_doc_special_word.search(line) is not None
        # Only add the separator when there is a previous line and it is not already empty.
        if is_special_word and position > 0 and len(lines[position - 1]) != 0:
            output.append("")
        output.append(line)
    return "\n".join(output)
|
|
|
|
|
|
def style_rst_file(doc_file, max_len=119, check_only=False):
    """
    Style one rst file `doc_file` to `max_len`.

    Returns `True` when the styled content differs from the content of the file. The file is only overwritten when
    it differs and `check_only` is `False`.
    """
    with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
        original = f.read()

    # Code blocks indented at 4, then empty lines before lists, then the actual styling.
    reindented = _reindent_code_blocks(original)
    with_list_breaks = _add_new_lines_before_list(reindented)
    styled = rst_styler.style(with_list_breaks, max_len=max_len)

    needs_restyle = styled != original
    if needs_restyle and not check_only:
        print(f"Overwriting content of {doc_file}.")
        with open(doc_file, "w", encoding="utf-8", newline="\n") as f:
            f.write(styled)

    return needs_restyle
|
|
|
|
|
|
def style_docstring(docstring, max_len=119):
    """
    Style `docstring` (the text between the triple quotes) to `max_len`.

    A docstring that fits on one line shorter than `max_len` is returned unchanged. Otherwise the result starts and
    ends with a new line, followed by the indent taken from the last line of the input.
    """
    is_one_liner = "\n" not in docstring
    if is_one_liner and len(docstring) < max_len:
        return docstring

    # The indent is read from the last line, the one that precedes the closing triple quotes.
    last_line = docstring.split("\n")[-1]
    blank_last_line = re.search(r"^(\s*)$", last_line)
    if blank_last_line is None:
        # The closing quotes are next to some text: use the indent of that text.
        text_indent = _re_indent.search(last_line)
        indent = "" if text_indent is None else text_indent.groups()[0]
    else:
        # The last line only holds the indent of the closing quotes: take it off the docstring.
        indent = blank_last_line.groups()[0]
        if len(indent) > 0:
            docstring = docstring[: -len(indent)]

    # Empty lines before Args/Returns etc., then before lists, then the actual styling.
    docstring = _add_new_lines_before_doc_special_words(docstring)
    docstring = _add_new_lines_before_list(docstring)
    styled_doc = doc_styler.style(docstring, max_len=max_len, min_indent=indent)

    # The styled docstring always starts and ends with a new line.
    if not styled_doc.startswith("\n"):
        styled_doc = "\n" + styled_doc
    if not styled_doc.endswith("\n"):
        styled_doc += "\n"
    return styled_doc + indent
|
|
|
|
|
|
def style_file_docstrings(code_file, max_len=119, check_only=False):
    """
    Style all docstrings in `code_file` to `max_len`.

    The file is split on triple double quotes and every odd piece is styled as a docstring, unless the piece just
    before it contains the docstyle-ignore tag. Returns `True` when the result differs from the content of the file;
    the file is only overwritten when it differs and `check_only` is `False`.
    """
    with open(code_file, "r", encoding="utf-8", newline="\n") as f:
        code = f.read()
    # fmt: off
    pieces = code.split('\"\"\"')
    styled_pieces = []
    for position, piece in enumerate(pieces):
        if position % 2 == 0 or _re_doc_ignore.search(pieces[position - 1]) is not None:
            # Code between docstrings, or a docstring preceded by the ignore tag: kept as is.
            styled_pieces.append(piece)
        else:
            styled_pieces.append(style_docstring(piece, max_len=max_len))
    clean_code = '\"\"\"'.join(styled_pieces)
    # fmt: on

    needs_restyle = clean_code != code
    if needs_restyle and not check_only:
        print(f"Overwriting content of {code_file}.")
        with open(code_file, "w", encoding="utf-8", newline="\n") as f:
            f.write(clean_code)

    return needs_restyle
|
|
|
|
|
|
def style_doc_files(*files, max_len=119, check_only=False):
    """
    Style all `files` to `max_len` and return the list of files whose styled content differs from their content.

    Folders are explored recursively for sub-folders, `.rst` files and `.py` files. Any other file is skipped with
    a warning. Files are only overwritten when `check_only` is `False`.
    """
    changed = []
    for file in files:
        if os.path.isdir(file):
            # Treat folders
            children = [os.path.join(file, name) for name in os.listdir(file)]
            candidates = [child for child in children if os.path.isdir(child) or child.endswith((".rst", ".py"))]
            changed += style_doc_files(*candidates, max_len=max_len, check_only=check_only)
            continue
        if file.endswith(".rst"):
            # Treat rst
            if style_rst_file(file, max_len=max_len, check_only=check_only):
                changed.append(file)
            continue
        if file.endswith(".py"):
            # Treat python files
            if style_file_docstrings(file, max_len=max_len, check_only=check_only):
                changed.append(file)
            continue
        warnings.warn(f"Ignoring {file} because it's not a py or an rst file or a folder.")
    return changed
|
|
|
|
|
|
def main(*files, max_len=119, check_only=False):
    """
    Style `files` to `max_len` and report the outcome.

    Raises a `ValueError` when `check_only` is set and some files should be restyled; otherwise prints how many
    files were cleaned, if any.
    """
    changed = style_doc_files(*files, max_len=max_len, check_only=check_only)
    if len(changed) == 0:
        return
    if check_only:
        raise ValueError(f"{len(changed)} files should be restyled!")
    print(f"Cleaned {len(changed)} files!")
|
|
|
|
|
|
# Command-line entry point: restyle (or only check) the given files and folders.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("files", nargs="+", help="The file(s) or folder(s) to restyle.")
    # Without a default, omitting the flag would pass `max_len=None` to `main`, overriding its own default of 119
    # and making the length comparisons fail with a TypeError.
    parser.add_argument("--max_len", type=int, default=119, help="The maximum length of lines.")
    parser.add_argument("--check_only", action="store_true", help="Whether to only check and not fix styling issues.")
    args = parser.parse_args()

    main(*args.files, max_len=args.max_len, check_only=args.check_only)
|