From 16edf4d9fdc68d6aa43fc9a99db57a0ea70b638b Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 10 Aug 2023 10:53:22 +0200
Subject: [PATCH] Doc checks (#25408)

* Document check_dummies

* Type hints and doc in other files

* Document check inits

* Add documentation to

* Address review comments
---
 utils/check_copies.py       | 139 +++++++++++---
 utils/check_doc_toc.py      |   5 +-
 utils/check_doctest_list.py |  11 +-
 utils/check_dummies.py      |  92 +++++++--
 utils/check_inits.py        |  78 +++++++-
 utils/check_repo.py         | 358 ++++++++++++++++++------------------
 6 files changed, 459 insertions(+), 224 deletions(-)

diff --git a/utils/check_copies.py b/utils/check_copies.py
index 0352b6419e..563f88a5ec 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -40,6 +40,7 @@ import argparse
 import glob
 import os
 import re
+from typing import List, Optional, Tuple
 
 import black
 from doc_builder.style_doc import style_docstrings_in_code
@@ -125,14 +126,22 @@ LOCALIZED_READMES = {
 transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
 
 
-def _should_continue(line, indent):
+def _should_continue(line: str, indent: str) -> bool:
     # Helper function. Returns `True` if `line` is empty, starts with the `indent` or is the end parenthesis of a
     # function definition
     return line.startswith(indent) or len(line.strip()) == 0 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None
 
 
-def find_code_in_transformers(object_name):
-    """Find and return the code source code of `object_name`."""
+def find_code_in_transformers(object_name: str) -> str:
+    """
+    Find and return the source code of an object.
+
+    Args:
+        object_name (`str`): The name of the object we want the source code of.
+
+    Returns:
+        `str`: The source code of the object.
+    """
     parts = object_name.split(".")
     i = 0
@@ -181,7 +190,16 @@ _re_replace_pattern = re.compile(r"^\s*(\S+)->(\S+)(\s+.*|$)")
 _re_fill_pattern = re.compile(r"<FILL\s+[^>]*>")
 
 
-def get_indent(code):
+def get_indent(code: str) -> str:
+    """
+    Find the indent in the first non-empty line in a code sample.
+
+    Args:
+        code (`str`): The code to inspect.
+
+    Returns:
+        `str`: The indent of that line, as a string.
+    """
     lines = code.split("\n")
     idx = 0
     while idx < len(lines) and len(lines[idx]) == 0:
@@ -191,9 +209,15 @@ def get_indent(code):
     return ""
 
 
-def blackify(code):
+def blackify(code: str) -> str:
     """
-    Applies the black part of our `make style` command to `code`.
+    Applies the black part of our `make style` command to some code.
+
+    Args:
+        code (`str`): The code to format.
+
+    Returns:
+        `str`: The formatted code.
     """
     has_indent = len(get_indent(code)) > 0
     if has_indent:
@@ -204,14 +228,22 @@ def blackify(code):
     return result[len("class Bla:\n") :] if has_indent else result
 
 
-def check_codes_match(observed_code, theoretical_code):
+def check_codes_match(observed_code: str, theoretical_code: str) -> Optional[int]:
     """
-    Checks if the code in `observed_code` and `theoretical_code` match with the exception of the class/function name.
-    Returns the index of the first line where there is a difference (if any) and `None` if the codes match.
+    Checks if two versions of a code match with the exception of the class/function name.
+
+    Args:
+        observed_code (`str`): The code found.
+        theoretical_code (`str`): The code to match.
+
+    Returns:
+        `Optional[int]`: The index of the first line where there is a difference (if any) and `None` if the codes
+        match.
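+
+    Example (an illustrative sketch with made-up snippets):
+
+    ```python
+    # Only the function names differ, so this is expected to return `None`.
+    check_codes_match("def foo(x):\n    return x + 1", "def bar(x):\n    return x + 1")
+    # The bodies differ on their line 1, so this is expected to return `1`.
+    check_codes_match("def foo(x):\n    return x + 1", "def bar(x):\n    return x + 2")
+    ```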
""" observed_code_header = observed_code.split("\n")[0] theoretical_code_header = theoretical_code.split("\n")[0] + # Catch the function/class name: it is expected that those do not match. _re_class_match = re.compile(r"class\s+([^\(:]+)(?:\(|:)") _re_func_match = re.compile(r"def\s+([^\(]+)\(") for re_pattern in [_re_class_match, _re_func_match]: @@ -220,6 +252,7 @@ def check_codes_match(observed_code, theoretical_code): theoretical_name = re_pattern.search(theoretical_code_header).groups()[0] theoretical_code_header = theoretical_code_header.replace(theoretical_name, observed_obj_name) + # Find the first diff. Line 0 is special since we need to compare with the function/class names ignored. diff_index = 0 if theoretical_code_header != observed_code_header: return 0 @@ -231,11 +264,19 @@ def check_codes_match(observed_code, theoretical_code): diff_index += 1 -def is_copy_consistent(filename, overwrite=False): +def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[Tuple[str, int]]]: """ - Check if the code commented as a copy in `filename` matches the original. + Check if the code commented as a copy in a file matches the original. - Return the differences or overwrites the content depending on `overwrite`. + Args: + filename (`str`): + The name of the file to check. + overwrite (`bool`, *optional*, defaults to `False`): + Whether or not to overwrite the copies when they don't match. + + Returns: + `Optional[List[Tuple[str, int]]]`: If `overwrite=False`, returns the list of differences as tuples `(str, int)` + with the name of the object having a diff and the line number where theere is the first diff. """ with open(filename, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() @@ -308,8 +349,12 @@ def is_copy_consistent(filename, overwrite=False): def check_copies(overwrite: bool = False): """ - Check every file is copy-consistent with the original and maybe `overwrite` content when it is not. Also check the - model list in the main README and other READMEs/index.md are consistent. + Check every file is copy-consistent with the original. Also check the model list in the main README and other + READMEs/index.md are consistent. + + Args: + overwrite (`bool`, *optional*, defaults to `False`): + Whether or not to overwrite the copies when they don't match. """ all_files = glob.glob(os.path.join(TRANSFORMERS_PATH, "**/*.py"), recursive=True) diffs = [] @@ -328,8 +373,11 @@ def check_copies(overwrite: bool = False): def check_full_copies(overwrite: bool = False): """ - Check the files that are full copies of others (as indicated in `FULL_COPIES`) are copy-consistent and maybe - `overwrite` to fix issues. + Check the files that are full copies of others (as indicated in `FULL_COPIES`) are copy-consistent. + + Args: + overwrite (`bool`, *optional*, defaults to `False`): + Whether or not to overwrite the copies when they don't match. """ diffs = [] for target, source in FULL_COPIES.items(): @@ -354,8 +402,18 @@ def check_full_copies(overwrite: bool = False): ) -def get_model_list(filename, start_prompt, end_prompt): - """Extracts the model list from a README, between `start_prompt` and `end_prompt`.""" +def get_model_list(filename: str, start_prompt: str, end_prompt: str) -> str: + """ + Extracts the model list from a README. + + Args: + filename (`str`): The name of the README file to check. + start_prompt (`str`): The string to look for that introduces the model list. + end_prompt (`str`): The string to look for that ends the model list. 
+
+    Returns:
+        `str`: The model list.
+    """
     with open(os.path.join(REPO_PATH, filename), "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
     # Find the start of the list.
@@ -368,6 +426,7 @@
     current_line = ""
     end_index = start_index
 
+    # Keep going until the end of the list.
     while not lines[end_index].startswith(end_prompt):
         if lines[end_index].startswith("1."):
             if len(current_line) > 1:
@@ -382,7 +441,7 @@
     return "".join(result)
 
 
-def convert_to_localized_md(model_list, localized_model_list, format_str):
+def convert_to_localized_md(model_list: str, localized_model_list: str, format_str: str) -> Tuple[bool, str]:
     """
     Compare the model list from the main README to the one in a localized README.
 
@@ -458,19 +517,33 @@
     return readmes_match, "\n".join((x[1] for x in sorted_index)) + "\n"
 
 
-def convert_readme_to_index(model_list):
+def convert_readme_to_index(model_list: str) -> str:
     """
-    Converts the model list of the README to the index.md format.
+    Converts the model list of the README to the index.md format (adapting links to the doc to relative links).
+
+    Args:
+        model_list (`str`): The model list of the main README.
+
+    Returns:
+        `str`: The model list in the format for the index.
     """
     # We need to replace both links to the main doc and stable doc (the order of the next two instructions is important).
     model_list = model_list.replace("https://huggingface.co/docs/transformers/main/", "")
     return model_list.replace("https://huggingface.co/docs/transformers/", "")
 
 
-def _find_text_in_file(filename, start_prompt, end_prompt):
+def _find_text_in_file(filename: str, start_prompt: str, end_prompt: str) -> Tuple[str, int, int, List[str]]:
     """
-    Find the text in `filename` between a line beginning with `start_prompt` and before `end_prompt`, removing empty
-    lines.
+    Find the text in a file between two prompts.
+
+    Args:
+        filename (`str`): The name of the file to look into.
+        start_prompt (`str`): The string to look for that introduces the content looked for.
+        end_prompt (`str`): The string to look for that ends the content looked for.
+
+    Returns:
+        `Tuple[str, int, int, List[str]]`: The content between the two prompts, the index of the start line in the
+        original file, the index of the end line in the original file and the list of lines of that file.
     """
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
@@ -493,9 +566,13 @@
     return "".join(lines[start_index:end_index]), start_index, end_index, lines
 
 
-def check_model_list_copy(overwrite=False, max_per_line=119):
+def check_model_list_copy(overwrite: bool = False):
     """
     Check the model list in the README is consistent with the ones in the other READMEs and also with `index.md`.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the copies when they don't match.
     """
     # Fix potential doc links in the README
     with open(os.path.join(REPO_PATH, "README.md"), "r", encoding="utf-8", newline="\n") as f:
@@ -526,6 +603,7 @@ def check_model_list_copy(overwrite=False, max_per_line=119):
         end_prompt=LOCALIZED_READMES["README.md"]["end_prompt"],
     )
 
+    # Build the converted Markdown.
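+    # Each entry is a (filename, readmes_match, converted_md_list, start_prompt, end_prompt) tuple.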
     converted_md_lists = []
     for filename, value in LOCALIZED_READMES.items():
         _start_prompt = value["start_prompt"]
         _end_prompt = value["end_prompt"]
@@ -537,6 +615,7 @@
 
         converted_md_lists.append((filename, readmes_match, converted_md_list, _start_prompt, _end_prompt))
 
+    # Build the converted index and compare it.
     converted_md_list = convert_readme_to_index(md_list)
     if converted_md_list != index_list:
         if overwrite:
@@ -548,6 +627,7 @@
                 "`make fix-copies` to fix this."
             )
 
+    # Compare the converted Markdowns
     for converted_md_list in converted_md_lists:
         filename, readmes_match, converted_md, _start_prompt, _end_prompt = converted_md_list
@@ -606,10 +686,13 @@ README_TEMPLATE = (
 )
 
 
-def check_readme(overwrite=False):
+def check_readme(overwrite: bool = False):
     """
-    Check if the main README contains all the models in the library or not. If `overwrite`, will add an entry for the
-    missing models using `README_TEMPLATE`.
+    Check if the main README contains all the models in the library or not.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an entry for the missing models using `README_TEMPLATE`.
     """
     info = LOCALIZED_READMES["README.md"]
     models, start_index, end_index, lines = _find_text_in_file(
diff --git a/utils/check_doc_toc.py b/utils/check_doc_toc.py
index 83c6be4795..ccbff5e0b6 100644
--- a/utils/check_doc_toc.py
+++ b/utils/check_doc_toc.py
@@ -34,6 +34,7 @@ python utils/check_doc_toc.py --fix_and_overwrite
 
 import argparse
 from collections import defaultdict
+from typing import List
 
 import yaml
 
@@ -41,7 +42,7 @@ import yaml
 PATH_TO_TOC = "docs/source/en/_toctree.yml"
 
 
-def clean_model_doc_toc(model_doc):
+def clean_model_doc_toc(model_doc: List[dict]) -> List[dict]:
     """
     Cleans a section of the table of contents of the model documentation (one specific modality) by removing duplicates
     and sorting models alphabetically.
@@ -77,7 +78,7 @@
     return sorted(new_doc, key=lambda s: s["title"].lower())
 
 
-def check_model_doc(overwrite=False):
+def check_model_doc(overwrite: bool = False):
     """
     Check that the content of the table of contents in `_toctree.yml` is clean (no duplicates and sorted for the model
     API doc) and potentially auto-clean it.
diff --git a/utils/check_doctest_list.py b/utils/check_doctest_list.py
index 3815a2bda0..ee751bc279 100644
--- a/utils/check_doctest_list.py
+++ b/utils/check_doctest_list.py
@@ -40,7 +40,16 @@ REPO_PATH = "."
 DOCTEST_FILE_PATHS = ["documentation_tests.txt", "slow_documentation_tests.txt"]
 
 
-def clean_doctest_list(doctest_file, overwrite=False):
+def clean_doctest_list(doctest_file: str, overwrite: bool = False):
+    """
+    Cleans the doctest list in a given file.
+
+    Args:
+        doctest_file (`str`):
+            The path to the doctest file to check or clean.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to fix problems. If `False`, will error when the file is not clean.
+    """
     non_existent_paths = []
     all_paths = []
     with open(doctest_file, "r", encoding="utf-8") as f:
diff --git a/utils/check_dummies.py b/utils/check_dummies.py
index 39869e87fb..a3ab6ebfa7 100644
--- a/utils/check_dummies.py
+++ b/utils/check_dummies.py
@@ -12,10 +12,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+""" +This script is responsible for making sure the dummies in utils/dummies_xxx.py are up to date with the main init. +Why dummies? This is to make sure that a user can always import all objects from `transformers`, even if they don't +have the necessary extra libs installed. Those objects will then raise helpful error message whenever the user tries +to access one of their methods. + +Usage (from the root of the repo): + +Check that the dummy files are up to date (used in `make repo-consistency`): + +```bash +python utils/check_dummies.py +``` + +Update the dummy files if needed (used in `make fix-copies`): + +```bash +python utils/check_dummies.py --fix_and_overwrite +``` +""" import argparse import os import re +from typing import Dict, List, Optional # All paths are set with the intent you should run this script from the root of the repo with the command @@ -26,13 +47,16 @@ PATH_TO_TRANSFORMERS = "src/transformers" _re_backend = re.compile(r"is\_([a-z_]*)_available()") # Matches from xxx import bla _re_single_line_import = re.compile(r"\s+from\s+\S*\s+import\s+([^\(\s].*)\n") +# Matches if not is_xxx_available() _re_test_backend = re.compile(r"^\s+if\s+not\s+\(?is\_[a-z_]*\_available\(\)") +# Template for the dummy objects. DUMMY_CONSTANT = """ {0} = None """ + DUMMY_CLASS = """ class {0}(metaclass=DummyObject): _backends = {1} @@ -48,8 +72,18 @@ def {0}(*args, **kwargs): """ -def find_backend(line): - """Find one (or multiple) backend in a code line of the init.""" +def find_backend(line: str) -> Optional[str]: + """ + Find one (or multiple) backend in a code line of the init. + + Args: + line (`str`): A code line in an init file. + + Returns: + Optional[`str`]: If one (or several) backend is found, returns it. In the case of multiple backends (the line + contains `if is_xxx_available() and `is_yyy_available()`) returns all backends joined on `_and_` (so + `xxx_and_yyy` for instance). + """ if _re_test_backend.search(line) is None: return None backends = [b[0] for b in _re_backend.findall(line)] @@ -57,8 +91,13 @@ def find_backend(line): return "_and_".join(backends) -def read_init(): - """Read the init and extracts PyTorch, TensorFlow, SentencePiece and Tokenizers objects.""" +def read_init() -> Dict[str, List[str]]: + """ + Read the init and extract backend-specific objects. + + Returns: + Dict[str, List[str]]: A dictionary mapping backend name to the list of object names requiring that backend. + """ with open(os.path.join(PATH_TO_TRANSFORMERS, "__init__.py"), "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() @@ -83,8 +122,10 @@ def read_init(): line = lines[line_index] single_line_import_search = _re_single_line_import.search(line) if single_line_import_search is not None: + # Single-line imports objects.extend(single_line_import_search.groups()[0].split(", ")) elif line.startswith(" " * 12): + # Multiple-line imports (with 3 indent level) objects.append(line[12:-2]) line_index += 1 @@ -95,8 +136,17 @@ def read_init(): return backend_specific_objects -def create_dummy_object(name, backend_name): - """Create the code for the dummy object corresponding to `name`.""" +def create_dummy_object(name: str, backend_name: str) -> str: + """ + Create the code for a dummy object. + + Args: + name (`str`): The name of the object. + backend_name (`str`): The name of the backend required for that object. + + Returns: + `str`: The code of the dummy object. 
+    """
     if name.isupper():
         return DUMMY_CONSTANT.format(name)
     elif name.islower():
@@ -105,11 +155,21 @@ def create_dummy_object(name, backend_name):
         return DUMMY_CLASS.format(name, backend_name)
 
 
-def create_dummy_files(backend_specific_objects=None):
-    """Create the content of the dummy files."""
+def create_dummy_files(backend_specific_objects: Optional[Dict[str, List[str]]] = None) -> Dict[str, str]:
+    """
+    Create the content of the dummy files.
+
+    Args:
+        backend_specific_objects (`Dict[str, List[str]]`, *optional*):
+            The mapping of backend name to the list of backend-specific objects. If not passed, will be obtained by
+            calling `read_init()`.
+
+    Returns:
+        `Dict[str, str]`: A dictionary mapping backend name to the code of the corresponding backend file.
+    """
     if backend_specific_objects is None:
         backend_specific_objects = read_init()
-    # For special correspondence backend to module name as used in the function requires_modulename
+
     dummy_files = {}
 
     for backend, objects in backend_specific_objects.items():
@@ -122,10 +182,17 @@ def create_dummy_files(backend_specific_objects=None):
     return dummy_files
 
 
-def check_dummies(overwrite=False):
-    """Check if the dummy files are up to date and maybe `overwrite` with the right content."""
+def check_dummies(overwrite: bool = False):
+    """
+    Check if the dummy files are up to date and maybe `overwrite` with the right content.
+
+    Args:
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the content of the dummy files. Will raise an error if they are not up to date
+            when `overwrite=False`.
+    """
     dummy_files = create_dummy_files()
-    # For special correspondence backend to shortcut as used in utils/dummy_xxx_objects.py
+    # Special correspondence between a backend name and its shortcut as used in utils/dummy_xxx_objects.py
     short_names = {"torch": "pt"}
 
     # Locate actual dummy modules and read their content.
@@ -143,6 +210,7 @@ def check_dummies(overwrite=False):
         else:
             actual_dummies[backend] = ""
 
+    # Compare actual with what they should be.
     for backend in dummy_files.keys():
         if dummy_files[backend] != actual_dummies[backend]:
             if overwrite:
diff --git a/utils/check_inits.py b/utils/check_inits.py
index 12b61223e4..43361adbf8 100644
--- a/utils/check_inits.py
+++ b/utils/check_inits.py
@@ -12,13 +12,37 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that checks the custom inits of Transformers are well-defined: Transformers uses init files that delay the
+import of an object to when it's actually needed. This is to avoid the main init importing all models, which would
+make the line `import transformers` very slow when the user has all optional dependencies installed. The inits with
+delayed imports have two halves: one defining a dictionary `_import_structure` which maps modules to the names of the
+objects in each module, and one in `TYPE_CHECKING` which looks like a normal init for type-checkers. The goal of this
+script is to check the objects defined in both halves are the same.
+
+This also checks the main init properly references all submodules, even if it doesn't import anything from them: every
+submodule should be defined as a key of `_import_structure` (potentially with an empty list as value), or the
+submodule won't be importable.
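+
+Schematically, such an init contains two halves like this (a minimal sketch with made-up names):
+
+```python
+_import_structure = {"configuration_xxx": ["XxxConfig"]}
+
+if TYPE_CHECKING:
+    from .configuration_xxx import XxxConfig
+```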
+
+Use from the root of the repo with:
+
+```bash
+python utils/check_inits.py
+```
+
+for a check that will error in case of inconsistencies (used by `make repo-consistency`).
+
+There is no auto-fix possible here sadly :-(
+"""
 import collections
 import os
 import re
 from pathlib import Path
+from typing import Dict, List, Optional, Tuple
 
 
+# Path is set with the intent you should run this script from the root of the repo.
 PATH_TO_TRANSFORMERS = "src/transformers"
 
@@ -46,8 +70,18 @@ _re_try = re.compile(r"^\s*try:")
 _re_else = re.compile(r"^\s*else:")
 
 
-def find_backend(line):
-    """Find one (or multiple) backend in a code line of the init."""
+def find_backend(line: str) -> Optional[str]:
+    """
+    Find one (or multiple) backend in a code line of the init.
+
+    Args:
+        line (`str`): A code line of the main init.
+
+    Returns:
+        `Optional[str]`: If one (or several) backend is found, returns it. In the case of multiple backends (the line
+        contains `if is_xxx_available() and is_yyy_available()`) returns all backends joined on `_and_` (so
+        `xxx_and_yyy` for instance).
+    """
     if _re_test_backend.search(line) is None:
         return None
     backends = [b[0] for b in _re_backend.findall(line)]
@@ -55,14 +89,23 @@ def find_backend(line):
     return "_and_".join(backends)
 
 
-def parse_init(init_file):
+def parse_init(init_file: str) -> Optional[Tuple[Dict[str, List[str]], Dict[str, List[str]]]]:
     """
-    Read an init_file and parse (per backend) the _import_structure objects defined and the TYPE_CHECKING objects
-    defined
+    Read an init_file and parse (per backend) the `_import_structure` objects defined and the `TYPE_CHECKING` objects
+    defined.
+
+    Args:
+        init_file (`str`): Path to the init file to inspect.
+
+    Returns:
+        `Optional[Tuple[Dict[str, List[str]], Dict[str, List[str]]]]`: A tuple of two dictionaries mapping backends
+        to lists of imported objects, one for the `_import_structure` part of the init and one for the
+        `TYPE_CHECKING` part of the init. Returns `None` if the init is not a custom init.
     """
     with open(init_file, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
 
+    # Get to the `_import_structure` definition.
     line_index = 0
     while line_index < len(lines) and not lines[line_index].startswith("_import_structure = {"):
         line_index += 1
@@ -91,7 +134,9 @@ def parse_init(init_file):
             objects.append(line[9:-3])
         line_index += 1
 
+    # Those are stored with the key "none".
     import_dict_objects = {"none": objects}
+
     # Let's continue with backend-specific objects in _import_structure
     while not lines[line_index].startswith("if TYPE_CHECKING"):
         # If the line is an if not is_backend_available, we grab all objects associated.
@@ -151,6 +196,7 @@ def parse_init(init_file):
         line_index += 1
 
     type_hint_objects = {"none": objects}
+
     # Let's continue with backend-specific objects
     while line_index < len(lines):
         # If the line is an if is_backend_available, we grab all objects associated.
@@ -186,19 +232,33 @@ def parse_init(init_file):
     return import_dict_objects, type_hint_objects
 
 
-def analyze_results(import_dict_objects, type_hint_objects):
+def analyze_results(import_dict_objects: Dict[str, List[str]], type_hint_objects: Dict[str, List[str]]) -> List[str]:
     """
     Analyze the differences between _import_structure objects and TYPE_CHECKING objects found in an init.
+
+    Args:
+        import_dict_objects (`Dict[str, List[str]]`):
+            A dictionary mapping backend names (`"none"` for the objects independent of any specific backend) to
+            the list of imported objects.
+        type_hint_objects (`Dict[str, List[str]]`):
+            A dictionary mapping backend names (`"none"` for the objects independent of any specific backend) to
+            the list of imported objects.
+
+    Returns:
+        `List[str]`: The list of errors corresponding to mismatches.
     """
 
     def find_duplicates(seq):
         return [k for k, v in collections.Counter(seq).items() if v > 1]
 
+    # If one backend is missing from the other part of the init, error early.
     if list(import_dict_objects.keys()) != list(type_hint_objects.keys()):
         return ["Both sides of the init do not have the same backends!"]
 
     errors = []
+    # Find all errors.
     for key in import_dict_objects.keys():
+        # Duplicate imports in any half.
         duplicate_imports = find_duplicates(import_dict_objects[key])
         if duplicate_imports:
             errors.append(f"Duplicate _import_structure definitions for: {duplicate_imports}")
@@ -206,6 +266,7 @@ def analyze_results(import_dict_objects, type_hint_objects):
         if duplicate_type_hints:
             errors.append(f"Duplicate TYPE_CHECKING objects for: {duplicate_type_hints}")
 
+        # Missing imports in either part of the init.
         if sorted(set(import_dict_objects[key])) != sorted(set(type_hint_objects[key])):
             name = "base imports" if key == "none" else f"{key} backend"
             errors.append(f"Differences for {name}:")
@@ -237,7 +298,7 @@ def check_all_inits():
         raise ValueError("\n\n".join(failures))
 
 
-def get_transformers_submodules():
+def get_transformers_submodules() -> List[str]:
     """
     Returns the list of Transformers submodules.
     """
@@ -272,6 +333,9 @@ IGNORE_SUBMODULES = [
 
 
 def check_submodules():
+    """
+    Check all submodules of Transformers are properly registered in the main init. Error otherwise.
+    """
     # This is to make sure the transformers module imported is the one in the repo.
     from transformers.utils import direct_transformers_import
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 7af69519c6..8be8469465 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -12,15 +12,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Utility that performs several consistency checks on the repo. This includes:
+- checking all models are properly defined in the __init__ of models/
+- checking all models are in the main __init__
+- checking all models are properly tested
+- checking all objects in the main __init__ are documented
+- checking all models are in at least one auto class
+- checking all the auto mappings are properly defined (no typos, importable)
+- checking the list of deprecated models is up to date
+
+Use from the root of the repo with (as used in `make repo-consistency`):
+
+```bash
+python utils/check_repo.py
+```
+
+It has no auto-fix mode.
+"""
 import inspect
 import os
 import re
 import sys
+import types
 import warnings
 from collections import OrderedDict
 from difflib import get_close_matches
 from pathlib import Path
+from typing import List, Tuple
 
 from transformers import is_flax_available, is_tf_available, is_torch_available
 from transformers.models.auto import get_values
@@ -60,91 +79,25 @@ PRIVATE_MODELS = [
 IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [  # models to ignore for not tested
     "InstructBlipQFormerModel",  # Building part of bigger (tested) model.
-    "NllbMoeDecoder",
-    "NllbMoeEncoder",
     "UMT5EncoderModel",  # Building part of bigger (tested) model.
-    "LlamaDecoder",  # Building part of bigger (tested) model.
     "Blip2QFormerModel",  # Building part of bigger (tested) model.
- "DetaEncoder", # Building part of bigger (tested) model. - "DetaDecoder", # Building part of bigger (tested) model. "ErnieMForInformationExtraction", - "GraphormerEncoder", # Building part of bigger (tested) model. "GraphormerDecoderHead", # Building part of bigger (tested) model. - "CLIPSegDecoder", # Building part of bigger (tested) model. - "TableTransformerEncoder", # Building part of bigger (tested) model. - "TableTransformerDecoder", # Building part of bigger (tested) model. - "TimeSeriesTransformerEncoder", # Building part of bigger (tested) model. - "TimeSeriesTransformerDecoder", # Building part of bigger (tested) model. - "InformerEncoder", # Building part of bigger (tested) model. - "InformerDecoder", # Building part of bigger (tested) model. - "AutoformerEncoder", # Building part of bigger (tested) model. - "AutoformerDecoder", # Building part of bigger (tested) model. "JukeboxVQVAE", # Building part of bigger (tested) model. "JukeboxPrior", # Building part of bigger (tested) model. - "DeformableDetrEncoder", # Building part of bigger (tested) model. - "DeformableDetrDecoder", # Building part of bigger (tested) model. - "OPTDecoder", # Building part of bigger (tested) model. - "FlaxWhisperDecoder", # Building part of bigger (tested) model. - "FlaxWhisperEncoder", # Building part of bigger (tested) model. - "WhisperDecoder", # Building part of bigger (tested) model. - "WhisperEncoder", # Building part of bigger (tested) model. "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. "SegformerDecodeHead", # Building part of bigger (tested) model. - "PLBartEncoder", # Building part of bigger (tested) model. - "PLBartDecoder", # Building part of bigger (tested) model. - "PLBartDecoderWrapper", # Building part of bigger (tested) model. - "BigBirdPegasusEncoder", # Building part of bigger (tested) model. - "BigBirdPegasusDecoder", # Building part of bigger (tested) model. - "BigBirdPegasusDecoderWrapper", # Building part of bigger (tested) model. - "DetrEncoder", # Building part of bigger (tested) model. - "DetrDecoder", # Building part of bigger (tested) model. - "DetrDecoderWrapper", # Building part of bigger (tested) model. - "ConditionalDetrEncoder", # Building part of bigger (tested) model. - "ConditionalDetrDecoder", # Building part of bigger (tested) model. - "M2M100Encoder", # Building part of bigger (tested) model. - "M2M100Decoder", # Building part of bigger (tested) model. - "MCTCTEncoder", # Building part of bigger (tested) model. "MgpstrModel", # Building part of bigger (tested) model. - "Speech2TextEncoder", # Building part of bigger (tested) model. - "Speech2TextDecoder", # Building part of bigger (tested) model. - "LEDEncoder", # Building part of bigger (tested) model. - "LEDDecoder", # Building part of bigger (tested) model. - "BartDecoderWrapper", # Building part of bigger (tested) model. - "BartEncoder", # Building part of bigger (tested) model. "BertLMHeadModel", # Needs to be setup as decoder. - "BlenderbotSmallEncoder", # Building part of bigger (tested) model. - "BlenderbotSmallDecoderWrapper", # Building part of bigger (tested) model. - "BlenderbotEncoder", # Building part of bigger (tested) model. - "BlenderbotDecoderWrapper", # Building part of bigger (tested) model. - "MBartEncoder", # Building part of bigger (tested) model. - "MBartDecoderWrapper", # Building part of bigger (tested) model. "MegatronBertLMHeadModel", # Building part of bigger (tested) model. - "MegatronBertEncoder", # Building part of bigger (tested) model. 
- "MegatronBertDecoder", # Building part of bigger (tested) model. - "MegatronBertDecoderWrapper", # Building part of bigger (tested) model. - "MusicgenDecoder", # Building part of bigger (tested) model. - "MvpDecoderWrapper", # Building part of bigger (tested) model. - "MvpEncoder", # Building part of bigger (tested) model. - "PegasusEncoder", # Building part of bigger (tested) model. - "PegasusDecoderWrapper", # Building part of bigger (tested) model. - "PegasusXEncoder", # Building part of bigger (tested) model. - "PegasusXDecoder", # Building part of bigger (tested) model. - "PegasusXDecoderWrapper", # Building part of bigger (tested) model. - "DPREncoder", # Building part of bigger (tested) model. - "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. "RealmBertModel", # Building part of bigger (tested) model. "RealmReader", # Not regular model. "RealmScorer", # Not regular model. "RealmForOpenQA", # Not regular model. "ReformerForMaskedLM", # Needs to be setup as decoder. - "Speech2Text2DecoderWrapper", # Building part of bigger (tested) model. - "TFDPREncoder", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) "TFRobertaForMultipleChoice", # TODO: fix "TFRobertaPreLayerNormForMultipleChoice", # TODO: fix - "TrOCRDecoderWrapper", # Building part of bigger (tested) model. - "TFWhisperEncoder", # Building part of bigger (tested) model. - "TFWhisperDecoder", # Building part of bigger (tested) model. "SeparableConv1D", # Building part of bigger (tested) model. "FlaxBartForCausalLM", # Building part of bigger (tested) model. "FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM. @@ -155,18 +108,6 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ "TFBlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models "BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model. "BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model. - "SpeechT5Decoder", # Building part of bigger (tested) model. - "SpeechT5DecoderWithoutPrenet", # Building part of bigger (tested) model. - "SpeechT5DecoderWithSpeechPrenet", # Building part of bigger (tested) model. - "SpeechT5DecoderWithTextPrenet", # Building part of bigger (tested) model. - "SpeechT5Encoder", # Building part of bigger (tested) model. - "SpeechT5EncoderWithoutPrenet", # Building part of bigger (tested) model. - "SpeechT5EncoderWithSpeechPrenet", # Building part of bigger (tested) model. - "SpeechT5EncoderWithTextPrenet", # Building part of bigger (tested) model. - "SpeechT5SpeechDecoder", # Building part of bigger (tested) model. - "SpeechT5SpeechEncoder", # Building part of bigger (tested) model. - "SpeechT5TextDecoder", # Building part of bigger (tested) model. - "SpeechT5TextEncoder", # Building part of bigger (tested) model. "BarkCausalModel", # Building part of bigger (tested) model. 
"BarkModel", # Does not have a forward signature - generation tested with integration tests ] @@ -236,12 +177,6 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "AutoformerForPrediction", "JukeboxVQVAE", "JukeboxPrior", - "PegasusXEncoder", - "PegasusXDecoder", - "PegasusXDecoderWrapper", - "PegasusXEncoder", - "PegasusXDecoder", - "PegasusXDecoderWrapper", "SamModel", "DPTForDepthEstimation", "DecisionTransformerGPT2Model", @@ -250,17 +185,11 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ "ViltForImageAndTextRetrieval", "ViltForTokenClassification", "ViltForMaskedLM", - "XGLMEncoder", - "XGLMDecoder", - "XGLMDecoderWrapper", "PerceiverForMultimodalAutoencoding", "PerceiverForOpticalFlow", "SegformerDecodeHead", "TFSegformerDecodeHead", "FlaxBeitForMaskedImageModeling", - "PLBartEncoder", - "PLBartDecoder", - "PLBartDecoderWrapper", "BeitForMaskedImageModeling", "ChineseCLIPTextModel", "ChineseCLIPVisionModel", @@ -347,7 +276,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ ] # DO NOT edit this list! -# (The corresponding pytorch objects should never be in the main `__init__`, but it's too late to remove) +# (The corresponding pytorch objects should never have been in the main `__init__`, but it's too late to remove) OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK = [ "FlaxBertLayer", "FlaxBigBirdLayer", @@ -361,8 +290,7 @@ OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK = [ "TFViTMAELayer", ] -# Update this list for models that have multiple model types for the same -# model doc +# Update this list for models that have multiple model types for the same model doc. MODEL_TYPE_TO_DOC_MAPPING = OrderedDict( [ ("data2vec-text", "data2vec"), @@ -378,6 +306,10 @@ transformers = direct_transformers_import(PATH_TO_TRANSFORMERS) def check_missing_backends(): + """ + Checks if all backends are installed (otherwise the check of this script is incomplete). Will error in the CI if + that's not the case but only throw a warning for users running this. + """ missing_backends = [] if not is_torch_available(): missing_backends.append("PyTorch") @@ -402,7 +334,9 @@ def check_missing_backends(): def check_model_list(): - """Check the model list inside the transformers library.""" + """ + Checks the model listed as subfolders of `models` match the models available in `transformers.models`. + """ # Get the models from the directory structure of `src/transformers/models/` models_dir = os.path.join(PATH_TO_TRANSFORMERS, "models") _models = [] @@ -413,7 +347,7 @@ def check_model_list(): if os.path.isdir(model_dir) and "__init__.py" in os.listdir(model_dir): _models.append(model) - # Get the models from the directory structure of `src/transformers/models/` + # Get the models in the submodule `transformers.models` models = [model for model in dir(transformers.models) if not model.startswith("__")] missing_models = sorted(set(_models).difference(models)) @@ -425,8 +359,8 @@ def check_model_list(): # If some modeling modules should be ignored for all checks, they should be added in the nested list # _ignore_modules of this function. 
-def get_model_modules():
-    """Get the model modules inside the transformers library."""
+def get_model_modules() -> List[types.ModuleType]:
+    """Get all the model modules inside the transformers library (except deprecated models)."""
     _ignore_modules = [
         "modeling_auto",
         "modeling_encoder_decoder",
@@ -454,21 +388,32 @@ def get_model_modules():
     ]
     modules = []
     for model in dir(transformers.models):
-        if model == "deprecated":
-            continue
         # There are some magic dunder attributes in the dir, we ignore them
-        if not model.startswith("__"):
-            model_module = getattr(transformers.models, model)
-            for submodule in dir(model_module):
-                if submodule.startswith("modeling") and submodule not in _ignore_modules:
-                    modeling_module = getattr(model_module, submodule)
-                    if inspect.ismodule(modeling_module):
-                        modules.append(modeling_module)
+        if model == "deprecated" or model.startswith("__"):
+            continue
+
+        model_module = getattr(transformers.models, model)
+        for submodule in dir(model_module):
+            if submodule.startswith("modeling") and submodule not in _ignore_modules:
+                modeling_module = getattr(model_module, submodule)
+                if inspect.ismodule(modeling_module):
+                    modules.append(modeling_module)
     return modules
 
 
-def get_models(module, include_pretrained=False):
-    """Get the objects in module that are models."""
+def get_models(module: types.ModuleType, include_pretrained: bool = False) -> List[Tuple[str, type]]:
+    """
+    Get the objects in a module that are models.
+
+    Args:
+        module (`types.ModuleType`):
+            The module from which we are extracting models.
+        include_pretrained (`bool`, *optional*, defaults to `False`):
+            Whether or not to include the `PreTrainedModel` subclasses (like `BertPreTrainedModel`).
+
+    Returns:
+        `List[Tuple[str, type]]`: List of models as tuples (class name, actual class).
+    """
     models = []
     model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel)
     for attr_name in dir(module):
@@ -480,12 +425,10 @@ def get_models(module, include_pretrained=False):
     return models
 
 
-def is_a_private_model(model):
-    """Returns True if the model should not be in the main init."""
-    if model in PRIVATE_MODELS:
-        return True
-
-    # Wrapper, Encoder and Decoder are all privates
+def is_building_block(model: str) -> bool:
+    """
+    Returns `True` if a model is a building block part of a bigger model.
+    """
     if model.endswith("Wrapper"):
         return True
     if model.endswith("Encoder"):
@@ -494,7 +437,14 @@ def is_a_private_model(model):
         return True
     if model.endswith("Prenet"):
         return True
     return False
+
+
+def is_a_private_model(model: str) -> bool:
+    """Returns `True` if the model should not be in the main init."""
+    if model in PRIVATE_MODELS:
+        return True
+    return is_building_block(model)
 
 
 def check_models_are_in_init():
@@ -514,11 +463,14 @@ def check_models_are_in_init():
 
 # If some test_modeling files should be ignored when checking models are all tested, they should be added in the
 # nested list _ignore_files of this function.
-def get_model_test_files():
-    """Get the model test files.
+def get_model_test_files() -> List[str]:
+    """
+    Get the model test files.
 
-    The returned files should NOT contain the `tests` (i.e. `PATH_TO_TESTS` defined in this script). They will be
-    considered as paths relative to `tests`. A caller has to use `os.path.join(PATH_TO_TESTS, ...)` to access the files.
+    Returns:
+        `List[str]`: The list of test files. The returned files will NOT contain the `tests` (i.e. `PATH_TO_TESTS`
+        defined in this script).
They will be considered as paths relative to `tests`. A caller has to use + `os.path.join(PATH_TO_TESTS, ...)` to access the files. """ _ignore_files = [ @@ -531,7 +483,6 @@ def get_model_test_files(): "test_modeling_tf_encoder_decoder", ] test_files = [] - # Check both `PATH_TO_TESTS` and `PATH_TO_TESTS/models` model_test_root = os.path.join(PATH_TO_TESTS, "models") model_test_dirs = [] for x in os.listdir(model_test_root): @@ -553,9 +504,17 @@ def get_model_test_files(): # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the tester class # for the all_model_classes variable. -def find_tested_models(test_file): - """Parse the content of test_file to detect what's in all_model_classes""" - # This is a bit hacky but I didn't find a way to import the test_file as a module and read inside the class +def find_tested_models(test_file: str) -> List[str]: + """ + Parse the content of test_file to detect what's in `all_model_classes`. This detects the models that inherit from + the common test class. + + Args: + test_file (`str`): The path to the test file to check + + Returns: + `List[str]`: The list of models tested in that file. + """ with open(os.path.join(PATH_TO_TESTS, test_file), "r", encoding="utf-8", newline="\n") as f: content = f.read() all_models = re.findall(r"all_model_classes\s+=\s+\(\s*\(([^\)]*)\)", content) @@ -571,8 +530,25 @@ def find_tested_models(test_file): return model_tested -def check_models_are_tested(module, test_file): - """Check models defined in module are tested in test_file.""" +def should_be_tested(model_name: str) -> bool: + """ + Whether or not a model should be tested. + """ + if model_name in IGNORE_NON_TESTED: + return False + return not is_building_block(model_name) + + +def check_models_are_tested(module: types.ModuleType, test_file: str) -> List[str]: + """Check models defined in a module are all tested in a given file. + + Args: + module (`types.ModuleType`): The module in which we get the models. + test_file (`str`): The path to the file where the module is tested. + + Returns: + `List[str]`: The list of error messages corresponding to models not tested. + """ # XxxPreTrainedModel are not tested defined_models = get_models(module) tested_models = find_tested_models(test_file) @@ -586,7 +562,7 @@ def check_models_are_tested(module, test_file): ] failures = [] for model_name, _ in defined_models: - if model_name not in tested_models and model_name not in IGNORE_NON_TESTED: + if model_name not in tested_models and should_be_tested(model_name): failures.append( f"{model_name} is defined in {module.__name__} but is not tested in " + f"{os.path.join(PATH_TO_TESTS, test_file)}. Add it to the all_model_classes in that file." @@ -602,6 +578,7 @@ def check_all_models_are_tested(): test_files = get_model_test_files() failures = [] for module in modules: + # Matches a module to its test file. test_file = [file for file in test_files if f"test_{module.__name__.split('.')[-1]}.py" in file] if len(test_file) == 0: failures.append(f"{module.__name__} does not have its corresponding test file {test_file}.") @@ -616,7 +593,7 @@ def check_all_models_are_tested(): raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures)) -def get_all_auto_configured_models(): +def get_all_auto_configured_models() -> List[str]: """Return the list of all models in at least one auto class.""" result = set() # To avoid duplicates we concatenate all model classes in a set. 
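+    # Each available framework (PyTorch, TensorFlow, Flax) contributes the model classes of its auto mappings below.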
     if is_torch_available():
@@ -634,8 +611,8 @@ def get_all_auto_configured_models():
     return list(result)
 
 
-def ignore_unautoclassed(model_name):
-    """Rules to determine if `name` should be in an auto class."""
+def ignore_unautoclassed(model_name: str) -> bool:
+    """Rules to determine if a model should be in an auto class."""
     # Special white list
     if model_name in IGNORE_NON_AUTO_CONFIGURED:
         return True
@@ -645,8 +622,19 @@ def ignore_unautoclassed(model_name):
     return False
 
 
-def check_models_are_auto_configured(module, all_auto_models):
-    """Check models defined in module are each in an auto class."""
+def check_models_are_auto_configured(module: types.ModuleType, all_auto_models: List[str]) -> List[str]:
+    """
+    Check models defined in module are each in an auto class.
+
+    Args:
+        module (`types.ModuleType`):
+            The module in which we get the models.
+        all_auto_models (`List[str]`):
+            The list of all models in an auto class (as obtained with `get_all_auto_configured_models()`).
+
+    Returns:
+        `List[str]`: The list of error messages corresponding to models not in an auto class.
+    """
     defined_models = get_models(module)
     failures = []
     for model_name, _ in defined_models:
@@ -661,6 +649,7 @@ def check_models_are_auto_configured(module, all_auto_models):
 
 def check_all_models_are_auto_configured():
     """Check all models are each in an auto class."""
+    # This is where we need to check we have all backends or the check is incomplete.
     check_missing_backends()
     modules = get_model_modules()
     all_auto_models = get_all_auto_configured_models()
@@ -675,6 +664,7 @@ def check_all_models_are_auto_configured():
 
 def check_all_auto_object_names_being_defined():
     """Check all names defined in auto (name) mappings exist in the library."""
+    # This is where we need to check we have all backends or the check is incomplete.
     check_missing_backends()
 
     failures = []
@@ -695,7 +685,7 @@ def check_all_auto_object_names_being_defined():
         mappings_to_check.update({name: getattr(module, name) for name in mapping_names})
 
     for name, mapping in mappings_to_check.items():
-        for model_type, class_names in mapping.items():
+        for _, class_names in mapping.items():
             if not isinstance(class_names, tuple):
                 class_names = (class_names,)
             for class_name in class_names:
@@ -716,6 +706,7 @@ def check_all_auto_object_names_being_defined():
 
 def check_all_auto_mapping_names_in_config_mapping_names():
     """Check all keys defined in auto mappings (mappings of names) appear in `CONFIG_MAPPING_NAMES`."""
+    # This is where we need to check we have all backends or the check is incomplete.
     check_missing_backends()
 
     failures = []
@@ -736,7 +727,7 @@ def check_all_auto_mapping_names_in_config_mapping_names():
         mappings_to_check.update({name: getattr(module, name) for name in mapping_names})
 
     for name, mapping in mappings_to_check.items():
-        for model_type, class_names in mapping.items():
+        for model_type in mapping:
             if model_type not in CONFIG_MAPPING_NAMES:
                 failures.append(
                     f"`{model_type}` appears in the mapping `{name}` but it is not defined in the keys of "
@@ -747,7 +738,8 @@ def check_all_auto_mapping_names_in_config_mapping_names():
 
 
 def check_all_auto_mappings_importable():
-    """Check all auto mappings could be imported."""
+    """Check all auto mappings can be imported."""
+    # This is where we need to check we have all backends or the check is incomplete.
     check_missing_backends()
 
     failures = []
@@ -761,7 +753,7 @@ def check_all_auto_mappings_importable():
         mapping_names = [x for x in dir(module) if x.endswith("_MAPPING_NAMES")]
         mappings_to_check.update({name: getattr(module, name) for name in mapping_names})
 
-    for name, _ in mappings_to_check.items():
+    for name in mappings_to_check:
         name = name.replace("_MAPPING_NAMES", "_MAPPING")
         if not hasattr(transformers, name):
             failures.append(f"`{name}`")
@@ -770,44 +762,46 @@ def check_all_auto_mappings_importable():
 
 
 def check_objects_being_equally_in_main_init():
-    """Check if an object is in the main __init__ if its counterpart in PyTorch is."""
+    """
+    Check if a (TensorFlow or Flax) object is in the main __init__ if and only if its counterpart in PyTorch is.
+    """
     attrs = dir(transformers)
 
     failures = []
     for attr in attrs:
         obj = getattr(transformers, attr)
-        if hasattr(obj, "__module__"):
-            module_path = obj.__module__
-            if "models.deprecated" in module_path:
-                continue
-            module_name = module_path.split(".")[-1]
-            module_dir = ".".join(module_path.split(".")[:-1])
-            if (
-                module_name.startswith("modeling_")
-                and not module_name.startswith("modeling_tf_")
-                and not module_name.startswith("modeling_flax_")
-            ):
-                parent_module = sys.modules[module_dir]
+        if not hasattr(obj, "__module__") or "models.deprecated" in obj.__module__:
+            continue
 
-                frameworks = []
-                if is_tf_available():
-                    frameworks.append("TF")
-                if is_flax_available():
-                    frameworks.append("Flax")
+        module_path = obj.__module__
+        module_name = module_path.split(".")[-1]
+        module_dir = ".".join(module_path.split(".")[:-1])
+        if (
+            module_name.startswith("modeling_")
+            and not module_name.startswith("modeling_tf_")
+            and not module_name.startswith("modeling_flax_")
+        ):
+            parent_module = sys.modules[module_dir]
 
-                for framework in frameworks:
-                    other_module_path = module_path.replace("modeling_", f"modeling_{framework.lower()}_")
-                    if os.path.isfile("src/" + other_module_path.replace(".", "/") + ".py"):
-                        other_module_name = module_name.replace("modeling_", f"modeling_{framework.lower()}_")
-                        other_module = getattr(parent_module, other_module_name)
-                        if hasattr(other_module, f"{framework}{attr}"):
-                            if not hasattr(transformers, f"{framework}{attr}"):
-                                if f"{framework}{attr}" not in OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK:
-                                    failures.append(f"{framework}{attr}")
-                        if hasattr(other_module, f"{framework}_{attr}"):
-                            if not hasattr(transformers, f"{framework}_{attr}"):
-                                if f"{framework}_{attr}" not in OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK:
-                                    failures.append(f"{framework}_{attr}")
+            frameworks = []
+            if is_tf_available():
+                frameworks.append("TF")
+            if is_flax_available():
+                frameworks.append("Flax")
+
+            for framework in frameworks:
+                other_module_path = module_path.replace("modeling_", f"modeling_{framework.lower()}_")
+                if os.path.isfile("src/" + other_module_path.replace(".", "/") + ".py"):
+                    other_module_name = module_name.replace("modeling_", f"modeling_{framework.lower()}_")
+                    other_module = getattr(parent_module, other_module_name)
+                    if hasattr(other_module, f"{framework}{attr}"):
+                        if not hasattr(transformers, f"{framework}{attr}"):
+                            if f"{framework}{attr}" not in OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK:
+                                failures.append(f"{framework}{attr}")
+                    if hasattr(other_module, f"{framework}_{attr}"):
+                        if not hasattr(transformers, f"{framework}_{attr}"):
+                            if f"{framework}_{attr}" not in OBJECT_TO_SKIP_IN_MAIN_INIT_CHECK:
+                                failures.append(f"{framework}_{attr}")
     if len(failures) > 0:
         raise Exception(f"There were {len(failures)} failures:\n" + "\n".join(failures))
@@ -815,8 +809,16 @@ def check_objects_being_equally_in_main_init():
 _re_decorator = re.compile(r"^\s*@(\S+)\s+$")
 
 
-def check_decorator_order(filename):
-    """Check that in the test file `filename` the slow decorator is always last."""
+def check_decorator_order(filename: str) -> List[int]:
+    """
+    Check that in a given test file, the slow decorator is always last.
+
+    Args:
+        filename (`str`): The path to a test file to check.
+
+    Returns:
+        `List[int]`: The indices of the lines where there are problems.
+    """
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
     decorator_before = None
@@ -849,8 +851,13 @@ def check_all_decorator_order():
         )
 
 
-def find_all_documented_objects():
-    """Parse the content of all doc files to detect which classes and functions it documents"""
+def find_all_documented_objects() -> List[str]:
+    """
+    Parse the content of all doc files to detect which classes and functions they document.
+
+    Returns:
+        `List[str]`: The list of all object names being documented.
+    """
     documented_obj = []
     for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
         with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
@@ -959,8 +966,8 @@ SHOULD_HAVE_THEIR_OWN_PAGE = [
 ]
 
 
-def ignore_undocumented(name):
-    """Rules to determine if `name` should be undocumented."""
+def ignore_undocumented(name: str) -> bool:
+    """Rules to determine if `name` should be undocumented (returns `True` if it should not be documented)."""
     # NOT DOCUMENTED ON PURPOSE.
     # Constants uppercase are not documented.
     if name.isupper():
@@ -1047,7 +1054,7 @@ _re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
 _re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)
 
 
-def is_rst_docstring(docstring):
+def is_rst_docstring(docstring: str) -> bool:
     """
     Returns `True` if `docstring` is written in rst.
     """
@@ -1061,7 +1068,7 @@ def is_rst_docstring(docstring):
 
 
 def check_docstrings_are_in_md():
-    """Check all docstrings are in md"""
+    """Check all docstrings are written in md and not rst."""
     files_with_rst = []
     for file in Path(PATH_TO_TRANSFORMERS).glob("**/*.py"):
        with open(file, encoding="utf-8") as f:
@@ -1084,6 +1091,9 @@ def check_docstrings_are_in_md():
 
 
 def check_deprecated_constant_is_up_to_date():
+    """
+    Check if the constant `DEPRECATED_MODELS` in `models/auto/configuration_auto.py` is up to date.
+    """
     deprecated_folder = os.path.join(PATH_TO_TRANSFORMERS, "models", "deprecated")
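+    # Every folder in models/deprecated (not starting with an underscore) should appear in `DEPRECATED_MODELS`.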
@@ -815,8 +809,16 @@ def check_objects_being_equally_in_main_init(): _re_decorator = re.compile(r"^\s*@(\S+)\s+$") -def check_decorator_order(filename): - """Check that in the test file `filename` the slow decorator is always last.""" +def check_decorator_order(filename: str) -> List[int]: + """ + Check that in a given test file, the slow decorator is always last. + + Args: + filename (`str`): The path to a test file to check. + + Returns: + `List[int]`: The list of failures as a list of indices where there are problems. + """ with open(filename, "r", encoding="utf-8", newline="\n") as f: lines = f.readlines() decorator_before = None @@ -849,8 +851,13 @@ def check_all_decorator_order(): ) -def find_all_documented_objects(): - """Parse the content of all doc files to detect which classes and functions it documents""" +def find_all_documented_objects() -> List[str]: + """ + Parse the content of all doc files to detect which classes and functions it documents. + + Returns: + `List[str]`: The list of all object names being documented. + """ documented_obj = [] for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"): with open(doc_file, "r", encoding="utf-8", newline="\n") as f: @@ -959,8 +966,8 @@ SHOULD_HAVE_THEIR_OWN_PAGE = [ ] -def ignore_undocumented(name): - """Rules to determine if `name` should be undocumented.""" +def ignore_undocumented(name: str) -> bool: + """Rules to determine if `name` should be undocumented (returns `True` if it should not be documented).""" # NOT DOCUMENTED ON PURPOSE. # Constants uppercase are not documented. if name.isupper(): @@ -1047,7 +1054,7 @@ _re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)") _re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE) -def is_rst_docstring(docstring): +def is_rst_docstring(docstring: str) -> True: """ Returns `True` if `docstring` is written in rst. """ @@ -1061,7 +1068,7 @@ def is_rst_docstring(docstring): def check_docstrings_are_in_md(): - """Check all docstrings are in md""" + """Check all docstrings are written in md and nor rst.""" files_with_rst = [] for file in Path(PATH_TO_TRANSFORMERS).glob("**/*.py"): with open(file, encoding="utf-8") as f: @@ -1084,6 +1091,9 @@ def check_docstrings_are_in_md(): def check_deprecated_constant_is_up_to_date(): + """ + Check if the constant `DEPRECATED_MODELS` in `models/auto/configuration_auto.py` is up to date. + """ deprecated_folder = os.path.join(PATH_TO_TRANSFORMERS, "models", "deprecated") deprecated_models = [m for m in os.listdir(deprecated_folder) if not m.startswith("_")]