From 07bf2dff78dce04314447efe46ccb70480374daf Mon Sep 17 00:00:00 2001
From: Joseph Enguehard
Date: Mon, 20 May 2024 09:06:57 +0100
Subject: [PATCH 01/80] Add TokenClassification for Mistral, Mixtral and Qwen2 (#29878)

* Add MistralForTokenClassification

* Add tests and docs

* Add token classification for Mixtral and Qwen2

* Save llma for token classification draft

* Add token classification support for Llama, Gemma, Persimmon, StableLm and StarCoder2

* Formatting

* Add token classification support for Qwen2Moe model

* Add dropout layer to each ForTokenClassification model

* Add copied from in tests

* Update src/transformers/models/llama/modeling_llama.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* Propagate suggested changes

* Style

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
---
 docs/source/en/model_doc/gemma.md | 5 +
 docs/source/en/model_doc/llama.md | 5 +
 docs/source/en/model_doc/mistral.md | 5 +
 docs/source/en/model_doc/mixtral.md | 5 +
 docs/source/en/model_doc/persimmon.md | 5 +
 docs/source/en/model_doc/qwen2.md | 5 +
 docs/source/en/model_doc/qwen2_moe.md | 5 +
 docs/source/en/model_doc/stablelm.md | 5 +
 docs/source/en/model_doc/starcoder2.md | 5 +
 src/transformers/__init__.py | 25 ++++-
 src/transformers/models/auto/modeling_auto.py | 9 ++
 src/transformers/models/gemma/__init__.py | 2 +
 .../models/gemma/modeling_gemma.py | 92 ++++++++++++++++++-
 src/transformers/models/llama/__init__.py | 2 +
 .../models/llama/modeling_llama.py | 85 +++++++++++++++++
 src/transformers/models/mistral/__init__.py | 2 +
 .../models/mistral/modeling_mistral.py | 92 ++++++++++++++++++-
 src/transformers/models/mixtral/__init__.py | 2 +
 .../models/mixtral/modeling_mixtral.py | 86 +++++++++++++++++
 src/transformers/models/persimmon/__init__.py | 2 +
 .../models/persimmon/modeling_persimmon.py | 92 ++++++++++++++++++-
 src/transformers/models/qwen2/__init__.py | 2 +
 .../models/qwen2/modeling_qwen2.py | 92 ++++++++++++++++++-
 src/transformers/models/qwen2_moe/__init__.py | 2 +
 .../models/qwen2_moe/modeling_qwen2_moe.py | 92 ++++++++++++++++++-
 src/transformers/models/stablelm/__init__.py | 2 +
 .../models/stablelm/modeling_stablelm.py | 92 ++++++++++++++++++-
 .../models/starcoder2/__init__.py | 2 +
 .../models/starcoder2/modeling_starcoder2.py | 92 ++++++++++++++++++-
 src/transformers/utils/dummy_pt_objects.py | 63 +++++++++++++
 tests/models/gemma/test_modeling_gemma.py | 31 ++++++-
 tests/models/llama/test_modeling_llama.py | 25 ++++-
 tests/models/mistral/test_modeling_mistral.py | 22 ++++-
 tests/models/mixtral/test_modeling_mixtral.py | 28 +++++-
 .../persimmon/test_modeling_persimmon.py | 22 ++++-
 tests/models/qwen2/test_modeling_qwen2.py | 24 ++++-
 .../qwen2_moe/test_modeling_qwen2_moe.py | 22 ++++-
 .../models/stablelm/test_modeling_stablelm.py | 22 ++++-
 .../starcoder2/test_modeling_starcoder2.py | 22 ++++-
 39 files changed, 1174 insertions(+), 19 deletions(-)

diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md
index f55995b6d8..abd077af8d 100644
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@@ -60,6 +60,11 @@ This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [
 [[autodoc]] GemmaForSequenceClassification
     - forward
 
+## GemmaForTokenClassification
+
+[[autodoc]] GemmaForTokenClassification
+    - forward
+
 ## FlaxGemmaModel
 
 [[autodoc]] FlaxGemmaModel
diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md
index 915d5ecc70..2f0eb63da0 100644
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@@ -121,6 +121,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] LlamaForQuestionAnswering
     - forward
 
+## LlamaForTokenClassification
+
+[[autodoc]] LlamaForTokenClassification
+    - forward
+
 ## FlaxLlamaModel
 
 [[autodoc]] FlaxLlamaModel
diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md
index 0ab2142061..d4bc761060 100644
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@@ -203,6 +203,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] MistralForSequenceClassification
     - forward
 
+## MistralForTokenClassification
+
+[[autodoc]] MistralForTokenClassification
+    - forward
+
 ## FlaxMistralModel
 
 [[autodoc]] FlaxMistralModel
diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md
index 942b040c3f..b93acdec58 100644
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@@ -204,3 +204,8 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 
 [[autodoc]] MixtralForSequenceClassification
     - forward
+
+## MixtralForTokenClassification
+
+[[autodoc]] MixtralForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md
index fe9e66a0b7..7a105ac554 100644
--- a/docs/source/en/model_doc/persimmon.md
+++ b/docs/source/en/model_doc/persimmon.md
@@ -96,3 +96,8 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. T
 
 [[autodoc]] PersimmonForSequenceClassification
     - forward
+
+## PersimmonForTokenClassification
+
+[[autodoc]] PersimmonForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md
index 5f9e5dba22..ac0e25e02c 100644
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@@ -80,3 +80,8 @@ In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inferen
 
 [[autodoc]] Qwen2ForSequenceClassification
     - forward
+
+## Qwen2ForTokenClassification
+
+[[autodoc]] Qwen2ForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md
index 8a546c4016..9c6dc80beb 100644
--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@@ -75,3 +75,8 @@ In the following, we demonstrate how to use `Qwen1.5-MoE-A2.7B-Chat` for the inf
 
 [[autodoc]] Qwen2MoeForSequenceClassification
     - forward
+
+## Qwen2MoeForTokenClassification
+
+[[autodoc]] Qwen2MoeForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md
index 6a50995ca0..09c0e5855c 100644
--- a/docs/source/en/model_doc/stablelm.md
+++ b/docs/source/en/model_doc/stablelm.md
@@ -104,3 +104,8 @@ Now, to run the model with Flash Attention 2, refer to the snippet below:
 
 [[autodoc]] StableLmForSequenceClassification
     - forward
+
+## StableLmForTokenClassification
+
+[[autodoc]] StableLmForTokenClassification
+    - forward
diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md
index 9e2e547b8c..1d107b3855 100644
--- a/docs/source/en/model_doc/starcoder2.md
+++ b/docs/source/en/model_doc/starcoder2.md
@@ -66,3 +66,8 @@ These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hu
 
 [[autodoc]] Starcoder2ForSequenceClassification
     - forward
+
+## Starcoder2ForTokenClassification
+
+[[autodoc]] Starcoder2ForTokenClassification
+    - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 18faabc807..4255e30379 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2031,6 +2031,7 @@ else:
         [
             "GemmaForCausalLM",
             "GemmaForSequenceClassification",
+            "GemmaForTokenClassification",
             "GemmaModel",
             "GemmaPreTrainedModel",
         ]
@@ -2288,6 +2289,7 @@ else:
             "LlamaForCausalLM",
             "LlamaForQuestionAnswering",
             "LlamaForSequenceClassification",
+            "LlamaForTokenClassification",
             "LlamaModel",
             "LlamaPreTrainedModel",
         ]
@@ -2435,12 +2437,19 @@ else:
         [
             "MistralForCausalLM",
             "MistralForSequenceClassification",
+            "MistralForTokenClassification",
             "MistralModel",
             "MistralPreTrainedModel",
         ]
     )
     _import_structure["models.mixtral"].extend(
-        ["MixtralForCausalLM", "MixtralForSequenceClassification", "MixtralModel", "MixtralPreTrainedModel"]
+        [
+            "MixtralForCausalLM",
+            "MixtralForSequenceClassification",
+            "MixtralForTokenClassification",
+            "MixtralModel",
+            "MixtralPreTrainedModel",
+        ]
     )
     _import_structure["models.mobilebert"].extend(
         [
@@ -2714,6 +2723,7 @@ else:
         [
             "PersimmonForCausalLM",
             "PersimmonForSequenceClassification",
+            "PersimmonForTokenClassification",
             "PersimmonModel",
             "PersimmonPreTrainedModel",
         ]
@@ -2810,6 +2820,7 @@ else:
         [
             "Qwen2ForCausalLM",
             "Qwen2ForSequenceClassification",
+            "Qwen2ForTokenClassification",
             "Qwen2Model",
             "Qwen2PreTrainedModel",
         ]
@@ -2818,6 +2829,7 @@ else:
         [
             "Qwen2MoeForCausalLM",
             "Qwen2MoeForSequenceClassification",
+            "Qwen2MoeForTokenClassification",
             "Qwen2MoeModel",
             "Qwen2MoePreTrainedModel",
         ]
@@ -3066,6 +3078,7 @@ else:
         [
             "StableLmForCausalLM",
             "StableLmForSequenceClassification",
+            "StableLmForTokenClassification",
             "StableLmModel",
             "StableLmPreTrainedModel",
         ]
@@ -3074,6 +3087,7 @@ else:
         [
             "Starcoder2ForCausalLM",
             "Starcoder2ForSequenceClassification",
+            "Starcoder2ForTokenClassification",
             "Starcoder2Model",
             "Starcoder2PreTrainedModel",
         ]
@@ -6489,6 +6503,7 @@ if TYPE_CHECKING:
         from .models.gemma import (
             GemmaForCausalLM,
             GemmaForSequenceClassification,
+            GemmaForTokenClassification,
             GemmaModel,
             GemmaPreTrainedModel,
         )
@@ -6686,6 +6701,7 @@ if TYPE_CHECKING:
             LlamaForCausalLM,
             LlamaForQuestionAnswering,
             LlamaForSequenceClassification,
+            LlamaForTokenClassification,
             LlamaModel,
             LlamaPreTrainedModel,
         )
@@ -6801,12 +6817,14 @@ if TYPE_CHECKING:
         from .models.mistral import (
             MistralForCausalLM,
             MistralForSequenceClassification,
+            MistralForTokenClassification,
             MistralModel,
             MistralPreTrainedModel,
         )
         from .models.mixtral import (
             MixtralForCausalLM,
             MixtralForSequenceClassification,
+            MixtralForTokenClassification,
             MixtralModel,
             MixtralPreTrainedModel,
         )
@@ -7025,6 +7043,7 @@ if TYPE_CHECKING:
         from .models.persimmon import (
             PersimmonForCausalLM,
             PersimmonForSequenceClassification,
+            PersimmonForTokenClassification,
             PersimmonModel,
             PersimmonPreTrainedModel,
         )
@@ -7099,12 +7118,14 @@ if TYPE_CHECKING:
         from .models.qwen2 import (
             Qwen2ForCausalLM,
             Qwen2ForSequenceClassification,
+            Qwen2ForTokenClassification,
             Qwen2Model,
             Qwen2PreTrainedModel,
         )
         from .models.qwen2_moe import (
             Qwen2MoeForCausalLM,
             Qwen2MoeForSequenceClassification,
+            Qwen2MoeForTokenClassification,
             Qwen2MoeModel,
             Qwen2MoePreTrainedModel,
         )
@@ -7306,12 +7327,14 @@ if TYPE_CHECKING:
         from .models.stablelm import (
             StableLmForCausalLM,
             StableLmForSequenceClassification,
+            StableLmForTokenClassification,
             StableLmModel,
             StableLmPreTrainedModel,
         )
         from .models.starcoder2 import (
             Starcoder2ForCausalLM,
Starcoder2ForSequenceClassification, + Starcoder2ForTokenClassification, Starcoder2Model, Starcoder2PreTrainedModel, ) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7825a0217f..c16ab83bdc 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1038,6 +1038,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("flaubert", "FlaubertForTokenClassification"), ("fnet", "FNetForTokenClassification"), ("funnel", "FunnelForTokenClassification"), + ("gemma", "GemmaForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("gpt_bigcode", "GPTBigCodeForTokenClassification"), @@ -1048,11 +1049,14 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("layoutlmv2", "LayoutLMv2ForTokenClassification"), ("layoutlmv3", "LayoutLMv3ForTokenClassification"), ("lilt", "LiltForTokenClassification"), + ("llama", "LlamaForTokenClassification"), ("longformer", "LongformerForTokenClassification"), ("luke", "LukeForTokenClassification"), ("markuplm", "MarkupLMForTokenClassification"), ("mega", "MegaForTokenClassification"), ("megatron-bert", "MegatronBertForTokenClassification"), + ("mistral", "MistralForTokenClassification"), + ("mixtral", "MixtralForTokenClassification"), ("mobilebert", "MobileBertForTokenClassification"), ("mpnet", "MPNetForTokenClassification"), ("mpt", "MptForTokenClassification"), @@ -1060,15 +1064,20 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("mt5", "MT5ForTokenClassification"), ("nezha", "NezhaForTokenClassification"), ("nystromformer", "NystromformerForTokenClassification"), + ("persimmon", "PersimmonForTokenClassification"), ("phi", "PhiForTokenClassification"), ("phi3", "Phi3ForTokenClassification"), ("qdqbert", "QDQBertForTokenClassification"), + ("qwen2", "Qwen2ForTokenClassification"), + ("qwen2_moe", "Qwen2MoeForTokenClassification"), ("rembert", "RemBertForTokenClassification"), ("roberta", "RobertaForTokenClassification"), ("roberta-prelayernorm", "RobertaPreLayerNormForTokenClassification"), ("roc_bert", "RoCBertForTokenClassification"), ("roformer", "RoFormerForTokenClassification"), ("squeezebert", "SqueezeBertForTokenClassification"), + ("stablelm", "StableLmForTokenClassification"), + ("starcoder2", "Starcoder2ForTokenClassification"), ("t5", "T5ForTokenClassification"), ("umt5", "UMT5ForTokenClassification"), ("xlm", "XLMForTokenClassification"), diff --git a/src/transformers/models/gemma/__init__.py b/src/transformers/models/gemma/__init__.py index 1c832e9051..1aafae6e88 100644 --- a/src/transformers/models/gemma/__init__.py +++ b/src/transformers/models/gemma/__init__.py @@ -55,6 +55,7 @@ else: "GemmaModel", "GemmaPreTrainedModel", "GemmaForSequenceClassification", + "GemmaForTokenClassification", ] try: @@ -98,6 +99,7 @@ if TYPE_CHECKING: from .modeling_gemma import ( GemmaForCausalLM, GemmaForSequenceClassification, + GemmaForTokenClassification, GemmaModel, GemmaPreTrainedModel, ) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index a3fc4b171d..ec88074ad5 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -30,7 +30,12 @@ from ...modeling_attn_mask_utils import ( AttentionMaskConverter, _prepare_4d_causal_attention_mask, ) -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, 
SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13 from ...utils import ( @@ -1346,3 +1351,88 @@ class GemmaForSequenceClassification(GemmaPreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Gemma Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + GEMMA_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Gemma, LLAMA->GEMMA +class GemmaForTokenClassification(GemmaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = GemmaModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py index 4b8a33118c..3f6461c4c0 100644 --- a/src/transformers/models/llama/__init__.py +++ b/src/transformers/models/llama/__init__.py @@ -55,6 +55,7 @@ else: "LlamaPreTrainedModel", "LlamaForSequenceClassification", "LlamaForQuestionAnswering", + "LlamaForTokenClassification", ] try: @@ -95,6 +96,7 @@ if TYPE_CHECKING: LlamaForCausalLM, LlamaForQuestionAnswering, LlamaForSequenceClassification, + LlamaForTokenClassification, LlamaModel, LlamaPreTrainedModel, ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ba02a7fc77..56a83c178c 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -36,6 +36,7 @@ from ...modeling_outputs import ( CausalLMOutputWithPast, QuestionAnsweringModelOutput, SequenceClassifierOutputWithPast, + TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS @@ -1516,3 +1517,87 @@ class LlamaForQuestionAnswering(LlamaPreTrainedModel): hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """ + The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + LLAMA_START_DOCSTRING, +) +class LlamaForTokenClassification(LlamaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = LlamaModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/mistral/__init__.py b/src/transformers/models/mistral/__init__.py index dc0b85980f..abf1e32a4b 100644 --- a/src/transformers/models/mistral/__init__.py +++ b/src/transformers/models/mistral/__init__.py @@ -32,6 +32,7 @@ else: "MistralModel", "MistralPreTrainedModel", "MistralForSequenceClassification", + "MistralForTokenClassification", ] try: @@ -59,6 +60,7 @@ if TYPE_CHECKING: from .modeling_mistral import ( MistralForCausalLM, MistralForSequenceClassification, + MistralForTokenClassification, MistralModel, MistralPreTrainedModel, ) diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index ccfa034ae6..cc5d8f0862 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -31,7 +31,12 
@@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -1366,3 +1371,88 @@ class MistralForSequenceClassification(MistralPreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Mistral Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. + """, + MISTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mistral, LLAMA->MISTRAL +class MistralForTokenClassification(MistralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MistralModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/mixtral/__init__.py b/src/transformers/models/mixtral/__init__.py index 7b8f061dac..b124d41dfb 100644 --- a/src/transformers/models/mixtral/__init__.py +++ b/src/transformers/models/mixtral/__init__.py @@ -36,6 +36,7 @@ else: "MixtralModel", "MixtralPreTrainedModel", "MixtralForSequenceClassification", + "MixtralForTokenClassification", ] @@ -51,6 +52,7 @@ if TYPE_CHECKING: from .modeling_mixtral import ( MixtralForCausalLM, MixtralForSequenceClassification, + MixtralForTokenClassification, MixtralModel, MixtralPreTrainedModel, ) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 19e4f15718..7bafe49905 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -38,6 +38,7 @@ from ...modeling_outputs import ( MoeCausalLMOutputWithPast, MoeModelOutputWithPast, SequenceClassifierOutputWithPast, + TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 @@ -1582,3 +1583,88 @@ class MixtralForSequenceClassification(MixtralPreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Mixtral Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + MIXTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Mixtral, LLAMA->MIXTRAL +class MixtralForTokenClassification(MixtralPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MixtralModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/persimmon/__init__.py b/src/transformers/models/persimmon/__init__.py index 75bc218a29..e1f24ca1b7 100644 --- a/src/transformers/models/persimmon/__init__.py +++ b/src/transformers/models/persimmon/__init__.py @@ -36,6 +36,7 @@ else: "PersimmonModel", "PersimmonPreTrainedModel", "PersimmonForSequenceClassification", + "PersimmonForTokenClassification", ] @@ -51,6 +52,7 @@ if TYPE_CHECKING: from .modeling_persimmon import ( PersimmonForCausalLM, PersimmonForSequenceClassification, + PersimmonForTokenClassification, PersimmonModel, PersimmonPreTrainedModel, ) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 8d4ad53207..75ba7163ba 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -29,7 +29,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_persimmon import PersimmonConfig @@ -1011,3 +1016,88 @@ class PersimmonForSequenceClassification(PersimmonPreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Persimmon Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + PERSIMMON_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Persimmon, LLAMA->PERSIMMON +class PersimmonForTokenClassification(PersimmonPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = PersimmonModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/qwen2/__init__.py b/src/transformers/models/qwen2/__init__.py index 3409f28214..35df37e91a 100644 --- a/src/transformers/models/qwen2/__init__.py +++ b/src/transformers/models/qwen2/__init__.py @@ -45,6 +45,7 @@ else: "Qwen2Model", "Qwen2PreTrainedModel", "Qwen2ForSequenceClassification", + "Qwen2ForTokenClassification", ] @@ -69,6 +70,7 @@ if TYPE_CHECKING: from .modeling_qwen2 import ( Qwen2ForCausalLM, Qwen2ForSequenceClassification, + Qwen2ForTokenClassification, Qwen2Model, Qwen2PreTrainedModel, ) diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 76b975ce9b..c9b8bd8fde 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -31,7 +31,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -1375,3 +1380,88 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Qwen2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + QWEN2_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2, LLAMA->QWEN2 +class Qwen2ForTokenClassification(Qwen2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/qwen2_moe/__init__.py b/src/transformers/models/qwen2_moe/__init__.py index fb12383278..e2b73ba2d1 100644 --- a/src/transformers/models/qwen2_moe/__init__.py +++ b/src/transformers/models/qwen2_moe/__init__.py @@ -36,6 +36,7 @@ else: "Qwen2MoeModel", "Qwen2MoePreTrainedModel", "Qwen2MoeForSequenceClassification", + "Qwen2MoeForTokenClassification", ] @@ -51,6 +52,7 @@ if TYPE_CHECKING: from .modeling_qwen2_moe import ( Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification, + Qwen2MoeForTokenClassification, Qwen2MoeModel, Qwen2MoePreTrainedModel, ) diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 914a2dbdf4..f8d1bf6bb4 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -32,7 +32,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + MoeCausalLMOutputWithPast, + MoeModelOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -1571,3 +1576,88 @@ class Qwen2MoeForSequenceClassification(Qwen2MoePreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Qwen2MoE Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + QWEN2MOE_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2Moe, LLAMA->QWEN2MOE +class Qwen2MoeForTokenClassification(Qwen2MoePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Qwen2MoeModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/stablelm/__init__.py b/src/transformers/models/stablelm/__init__.py index 7fc3a6857f..c00c045f7f 100644 --- a/src/transformers/models/stablelm/__init__.py +++ b/src/transformers/models/stablelm/__init__.py @@ -36,6 +36,7 @@ else: "StableLmModel", "StableLmPreTrainedModel", "StableLmForSequenceClassification", + "StableLmForTokenClassification", ] @@ -51,6 +52,7 @@ if TYPE_CHECKING: from .modeling_stablelm import ( StableLmForCausalLM, StableLmForSequenceClassification, + StableLmForTokenClassification, StableLmModel, StableLmPreTrainedModel, ) diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index bc133ffb3d..91a6e83a8b 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -30,7 +30,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -1383,3 +1388,88 @@ class StableLmForSequenceClassification(StableLmPreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The StableLm Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + STABLELM_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->StableLm, LLAMA->STABLELM +class StableLmForTokenClassification(StableLmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = StableLmModel(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/starcoder2/__init__.py b/src/transformers/models/starcoder2/__init__.py index 1eb195fde1..d9dc2cd1e5 100644 --- a/src/transformers/models/starcoder2/__init__.py +++ b/src/transformers/models/starcoder2/__init__.py @@ -36,6 +36,7 @@ else: "Starcoder2Model", "Starcoder2PreTrainedModel", "Starcoder2ForSequenceClassification", + "Starcoder2ForTokenClassification", ] @@ -51,6 +52,7 @@ if TYPE_CHECKING: from .modeling_starcoder2 import ( Starcoder2ForCausalLM, Starcoder2ForSequenceClassification, + Starcoder2ForTokenClassification, Starcoder2Model, Starcoder2PreTrainedModel, ) diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 24d285e0b3..ae64b23aa6 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -31,7 +31,12 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa -from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, @@ -1357,3 +1362,88 @@ class Starcoder2ForSequenceClassification(Starcoder2PreTrainedModel): hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, ) + + +@add_start_docstrings( + """ + The Starcoder2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states + output) e.g. for Named-Entity-Recognition (NER) tasks. 
+ """, + STARCODER2_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Starcoder2, LLAMA->STARCODER2 +class Starcoder2ForTokenClassification(Starcoder2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Starcoder2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 681f858556..5e00230aed 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3692,6 +3692,13 @@ class GemmaForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class GemmaForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GemmaModel(metaclass=DummyObject): _backends = ["torch"] @@ -4642,6 +4649,13 @@ class LlamaForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class LlamaForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class LlamaModel(metaclass=DummyObject): _backends = ["torch"] @@ -5237,6 +5251,13 @@ class MistralForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class MistralForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MistralModel(metaclass=DummyObject): _backends = ["torch"] @@ -5265,6 +5286,13 @@ class MixtralForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class MixtralForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MixtralModel(metaclass=DummyObject): _backends = ["torch"] @@ -6373,6 +6401,13 @@ class PersimmonForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class PersimmonForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PersimmonModel(metaclass=DummyObject): _backends = ["torch"] @@ -6734,6 +6769,13 @@ class Qwen2ForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class Qwen2ForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Qwen2Model(metaclass=DummyObject): _backends = ["torch"] @@ -6762,6 +6804,13 @@ class Qwen2MoeForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class Qwen2MoeForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + 
class Qwen2MoeModel(metaclass=DummyObject): _backends = ["torch"] @@ -7793,6 +7842,13 @@ class StableLmForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class StableLmForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class StableLmModel(metaclass=DummyObject): _backends = ["torch"] @@ -7821,6 +7877,13 @@ class Starcoder2ForSequenceClassification(metaclass=DummyObject): requires_backends(self, ["torch"]) +class Starcoder2ForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Starcoder2Model(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 7cc2ebe7bd..8281f59993 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -41,7 +41,13 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch - from transformers import GemmaForCausalLM, GemmaForSequenceClassification, GemmaModel, GemmaTokenizer + from transformers import ( + GemmaForCausalLM, + GemmaForSequenceClassification, + GemmaForTokenClassification, + GemmaModel, + GemmaTokenizer, + ) class GemmaModelTester: @@ -284,12 +290,17 @@ class GemmaModelTester: @require_torch class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (GemmaModel, GemmaForCausalLM, GemmaForSequenceClassification) if is_torch_available() else () + all_model_classes = ( + (GemmaModel, GemmaForCausalLM, GemmaForSequenceClassification, GemmaForTokenClassification) + if is_torch_available() + else () + ) all_generative_model_classes = (GemmaForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": GemmaModel, "text-classification": GemmaForSequenceClassification, + "token-classification": GemmaForTokenClassification, "text-generation": GemmaForCausalLM, "zero-shot": GemmaForSequenceClassification, } @@ -370,6 +381,22 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Gemma,llama->Gemma + def test_Gemma_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = GemmaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Gemma buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 9fcdcc6d7e..58269d62e0 100644 --- 
a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -47,6 +47,7 @@ if is_torch_available(): LlamaForCausalLM, LlamaForQuestionAnswering, LlamaForSequenceClassification, + LlamaForTokenClassification, LlamaModel, LlamaTokenizer, ) @@ -286,7 +287,13 @@ class LlamaModelTester: @require_torch class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (LlamaModel, LlamaForCausalLM, LlamaForSequenceClassification, LlamaForQuestionAnswering) + ( + LlamaModel, + LlamaForCausalLM, + LlamaForSequenceClassification, + LlamaForQuestionAnswering, + LlamaForTokenClassification, + ) if is_torch_available() else () ) @@ -298,6 +305,7 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi "text-generation": LlamaForCausalLM, "zero-shot": LlamaForSequenceClassification, "question-answering": LlamaForQuestionAnswering, + "token-classification": LlamaForTokenClassification, } if is_torch_available() else {} @@ -370,6 +378,21 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + def test_llama_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = LlamaForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Llama buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index bbc36c050e..10dcb3bf4d 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -46,6 +46,7 @@ if is_torch_available(): from transformers import ( MistralForCausalLM, MistralForSequenceClassification, + MistralForTokenClassification, MistralModel, ) @@ -288,13 +289,16 @@ class MistralModelTester: @require_torch class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (MistralModel, MistralForCausalLM, MistralForSequenceClassification) if is_torch_available() else () + (MistralModel, MistralForCausalLM, MistralForSequenceClassification, MistralForTokenClassification) + if is_torch_available() + else () ) all_generative_model_classes = (MistralForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": MistralModel, "text-classification": MistralForSequenceClassification, + "token-classification": MistralForTokenClassification, "text-generation": MistralForCausalLM, "zero-shot": MistralForSequenceClassification, } @@ -376,6 +380,22 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, 
(self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Mistral,llama->Mistral + def test_Mistral_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = MistralForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Mistral buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0d92595d8c..0972207fce 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -40,7 +40,12 @@ from ...test_pipeline_mixin import PipelineTesterMixin if is_torch_available(): import torch - from transformers import MixtralForCausalLM, MixtralForSequenceClassification, MixtralModel + from transformers import ( + MixtralForCausalLM, + MixtralForSequenceClassification, + MixtralForTokenClassification, + MixtralModel, + ) class MixtralModelTester: @@ -287,13 +292,16 @@ class MixtralModelTester: # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Mixtral class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification) if is_torch_available() else () + (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification, MixtralForTokenClassification) + if is_torch_available() + else () ) all_generative_model_classes = (MixtralForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": MixtralModel, "text-classification": MixtralForSequenceClassification, + "token-classification": MixtralForTokenClassification, "text-generation": MixtralForCausalLM, "zero-shot": MixtralForSequenceClassification, } @@ -375,6 +383,22 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Mixtral,llama->Mixtral + def test_Mixtral_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = MixtralForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, 
self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Mixtral buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 86a69d774f..46a650c55a 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -44,6 +44,7 @@ if is_torch_available(): AutoTokenizer, PersimmonForCausalLM, PersimmonForSequenceClassification, + PersimmonForTokenClassification, PersimmonModel, ) from transformers.models.persimmon.modeling_persimmon import ( @@ -283,12 +284,15 @@ class PersimmonModelTester: @require_torch class PersimmonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (PersimmonModel, PersimmonForCausalLM, PersimmonForSequenceClassification) if is_torch_available() else () + (PersimmonModel, PersimmonForCausalLM, PersimmonForSequenceClassification, PersimmonForTokenClassification) + if is_torch_available() + else () ) pipeline_model_mapping = ( { "feature-extraction": PersimmonModel, "text-classification": PersimmonForSequenceClassification, + "token-classification": PersimmonForTokenClassification, # TODO (ydshieh): check why these two fail. Fix them or skip them in a better way. # "text-generation": PersimmonForCausalLM, # "zero-shot": PersimmonForSequenceClassification, @@ -365,6 +369,22 @@ class PersimmonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Persimmon,llama->persimmon + def test_persimmon_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = PersimmonForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Persimmon buffers include complex numbers, which breaks this test") # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_save_load_fast_init_from_base def test_save_load_fast_init_from_base(self): diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index f4e88a97f0..54718c4303 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -45,6 +45,7 @@ if is_torch_available(): from transformers import ( Qwen2ForCausalLM, Qwen2ForSequenceClassification, + Qwen2ForTokenClassification, Qwen2Model, ) @@ -299,12 +300,17 @@ class Qwen2ModelTester: @require_torch # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2 class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): - all_model_classes = (Qwen2Model, 
Qwen2ForCausalLM, Qwen2ForSequenceClassification) if is_torch_available() else () + all_model_classes = ( + (Qwen2Model, Qwen2ForCausalLM, Qwen2ForSequenceClassification, Qwen2ForTokenClassification) + if is_torch_available() + else () + ) all_generative_model_classes = (Qwen2ForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": Qwen2Model, "text-classification": Qwen2ForSequenceClassification, + "token-classification": Qwen2ForTokenClassification, "text-generation": Qwen2ForCausalLM, "zero-shot": Qwen2ForSequenceClassification, } @@ -387,6 +393,22 @@ class Qwen2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Qwen2,llama->Qwen2 + def test_Qwen2_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = Qwen2ForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Qwen2 buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index f0818e680d..48c6ccf78a 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -45,6 +45,7 @@ if is_torch_available(): from transformers import ( Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification, + Qwen2MoeForTokenClassification, Qwen2MoeModel, ) @@ -327,13 +328,16 @@ class Qwen2MoeModelTester: # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Qwen2Moe class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (Qwen2MoeModel, Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification) if is_torch_available() else () + (Qwen2MoeModel, Qwen2MoeForCausalLM, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification) + if is_torch_available() + else () ) all_generative_model_classes = (Qwen2MoeForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": Qwen2MoeModel, "text-classification": Qwen2MoeForSequenceClassification, + "token-classification": Qwen2MoeForTokenClassification, "text-generation": Qwen2MoeForCausalLM, "zero-shot": Qwen2MoeForSequenceClassification, } @@ -414,6 +418,22 @@ class Qwen2MoeModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Qwen2Moe,llama->Qwen2Moe + 
def test_Qwen2Moe_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = Qwen2MoeForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Qwen2Moe buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index f5e74ead9b..083f928612 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -43,6 +43,7 @@ if is_torch_available(): AutoTokenizer, StableLmForCausalLM, StableLmForSequenceClassification, + StableLmForTokenClassification, StableLmModel, ) from transformers.models.stablelm.modeling_stablelm import ( @@ -287,12 +288,15 @@ class StableLmModelTester: # Copied from transformers.tests.persimmon.test_modeling_persimmon.PersimmonModelTest with Persimmon -> StableLm class StableLmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (StableLmModel, StableLmForCausalLM, StableLmForSequenceClassification) if is_torch_available() else () + (StableLmModel, StableLmForCausalLM, StableLmForSequenceClassification, StableLmForTokenClassification) + if is_torch_available() + else () ) pipeline_model_mapping = ( { "feature-extraction": StableLmModel, "text-classification": StableLmForSequenceClassification, + "token-classification": StableLmForTokenClassification, # TODO (ydshieh): check why these two fail. Fix them or skip them in a better way. 
# "text-generation": StableLmForCausalLM, # "zero-shot": StableLmForSequenceClassification, @@ -356,6 +360,22 @@ class StableLmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->StableLm,llama->stablelm + def test_stablelm_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, self.model_tester.seq_length], config.num_labels) + model = StableLmForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @parameterized.expand([("linear",), ("dynamic",)]) # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_model_rope_scaling_from_config with Llama->StableLm def test_model_rope_scaling_from_config(self, scaling_type): diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 95f604d06b..faba4d254b 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -43,6 +43,7 @@ if is_torch_available(): AutoTokenizer, Starcoder2ForCausalLM, Starcoder2ForSequenceClassification, + Starcoder2ForTokenClassification, Starcoder2Model, ) @@ -290,13 +291,16 @@ class Starcoder2ModelTester: # Copied from transformers.tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Starcoder2 class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( - (Starcoder2Model, Starcoder2ForCausalLM, Starcoder2ForSequenceClassification) if is_torch_available() else () + (Starcoder2Model, Starcoder2ForCausalLM, Starcoder2ForSequenceClassification, Starcoder2ForTokenClassification) + if is_torch_available() + else () ) all_generative_model_classes = (Starcoder2ForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( { "feature-extraction": Starcoder2Model, "text-classification": Starcoder2ForSequenceClassification, + "token-classification": Starcoder2ForTokenClassification, "text-generation": Starcoder2ForCausalLM, "zero-shot": Starcoder2ForSequenceClassification, } @@ -370,6 +374,22 @@ class Starcoder2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_llama_token_classification_model with Llama->Starcoder2,llama->Starcoder2 + def test_Starcoder2_token_classification_model(self): + config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.num_labels = 3 + input_ids = input_dict["input_ids"] + attention_mask = input_ids.ne(1).to(torch_device) + token_labels = ids_tensor([self.model_tester.batch_size, 
self.model_tester.seq_length], config.num_labels) + model = Starcoder2ForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=attention_mask, labels=token_labels) + self.assertEqual( + result.logits.shape, + (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), + ) + @unittest.skip("Starcoder2 buffers include complex numbers, which breaks this test") def test_save_load_fast_init_from_base(self): pass From fce78fd0e917312da36cea4d352896a8f6ec9cdc Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 20 May 2024 10:08:26 +0200 Subject: [PATCH 02/80] FIX / Quantization: Fix Dockerfile build (#30890) * Update Dockerfile * Update docker/transformers-quantization-latest-gpu/Dockerfile --- docker/transformers-quantization-latest-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 0fd441bc05..2b74dca91f 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive From cd6bd0af3435ab731f7ed1147def5fae3ffc941e Mon Sep 17 00:00:00 2001 From: Benjamin Warner Date: Mon, 20 May 2024 03:36:57 -0500 Subject: [PATCH 03/80] Add support for torch.compile dynamic shapes (#30560) * add torch.compile dynamic support * Add SDPA dynamic shapes compile test & improve SDPA comment * comment consistency --- src/transformers/models/bart/modeling_bart.py | 8 +++- src/transformers/models/bert/modeling_bert.py | 8 ++-- .../models/cohere/modeling_cohere.py | 4 +- .../data2vec/modeling_data2vec_audio.py | 8 +++- .../models/falcon/modeling_falcon.py | 22 ++++++---- .../models/gemma/modeling_gemma.py | 4 +- .../gpt_bigcode/modeling_gpt_bigcode.py | 9 +++- .../models/hubert/modeling_hubert.py | 8 +++- .../models/idefics/modeling_idefics.py | 12 ++++-- .../models/llama/modeling_llama.py | 4 +- .../models/mistral/modeling_mistral.py | 8 +++- .../models/mixtral/modeling_mixtral.py | 8 +++- .../models/musicgen/modeling_musicgen.py | 8 +++- .../modeling_musicgen_melody.py | 8 +++- src/transformers/models/olmo/modeling_olmo.py | 4 +- src/transformers/models/phi/modeling_phi.py | 6 ++- src/transformers/models/phi3/modeling_phi3.py | 8 +++- .../models/qwen2/modeling_qwen2.py | 8 +++- .../models/qwen2_moe/modeling_qwen2_moe.py | 8 +++- src/transformers/models/sew/modeling_sew.py | 8 +++- .../models/stablelm/modeling_stablelm.py | 8 +++- .../models/starcoder2/modeling_starcoder2.py | 8 +++- .../models/unispeech/modeling_unispeech.py | 8 +++- .../unispeech_sat/modeling_unispeech_sat.py | 8 +++- .../models/wav2vec2/modeling_wav2vec2.py | 8 +++- .../models/whisper/modeling_whisper.py | 8 +++- tests/test_modeling_common.py | 41 +++++++++++++++++++ 27 files changed, 190 insertions(+), 60 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 33fea9a018..bcc9ac3e07 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -592,6 +592,11 @@ class BartSdpaAttention(BartAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash 
Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -600,8 +605,7 @@ class BartSdpaAttention(BartAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index b516a97187..033dc6ba66 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -428,9 +428,11 @@ class BertSdpaSelfAttention(BertSelfAttention): key_layer = key_layer.contiguous() value_layer = value_layer.contiguous() - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal - # mask in case tgt_len == 1. - is_causal = self.is_decoder and attention_mask is None and tgt_len > 1 + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. + is_causal = True if self.is_decoder and attention_mask is None and tgt_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( query_layer, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index d6f37af8da..41c4e151a3 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -587,8 +587,8 @@ class CohereSdpaAttention(CohereAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an - # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index e77bc728ab..8837c278c5 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -788,6 +788,11 @@ class Data2VecAudioSdpaAttention(Data2VecAudioAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -796,8 +801,7 @@ class Data2VecAudioSdpaAttention(Data2VecAudioAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 0a14fcb37b..b9fbf8d70b 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -434,16 +434,19 @@ class FalconAttention(nn.Module): if alibi is None: if self._use_sdpa and not output_attentions: - attn_output = F.scaled_dot_product_attention( + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not + # create a causal mask in case query_length == 1. + is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_layer, key_layer, value_layer, - attention_mask, - 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. 
- is_causal=self.is_causal and attention_mask is None and query_length > 1, + attn_mask=attention_mask, + dropout_p=0.0, + is_causal=is_causal, ) - attention_scores = None else: attention_scores = query_layer @ key_layer.transpose(-1, -2) @@ -466,13 +469,16 @@ class FalconAttention(nn.Module): else: if self._use_sdpa and not output_attentions and head_mask is None: - attn_output = F.scaled_dot_product_attention( + # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an + # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_layer, key_layer, value_layer, attn_mask=attention_mask, dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and query_length > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index ec88074ad5..200f91f750 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -571,8 +571,8 @@ class GemmaSdpaAttention(GemmaAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an - # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 37ed2aba62..7b7bfaf1d4 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -549,14 +549,19 @@ class GPTBigCodeSdpaAttention(GPTBigCodeAttention): key = key.contiguous() value = value.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not + # create a causal mask in case query_length == 1. + is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False + sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=self.attn_pdrop if self.training else 0.0, - # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1. 
- is_causal=self.is_causal and attention_mask is None and query_length > 1, + is_causal=is_causal, scale=scale, ) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 3d1d0884c6..83d90f5ded 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -852,6 +852,11 @@ class HubertSdpaAttention(HubertAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -860,8 +865,7 @@ class HubertSdpaAttention(HubertAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 622e336fe4..68ec6ab8a5 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -660,14 +660,18 @@ class IdeficsAttention(nn.Module): key_states = key_states.contiguous() value_states = value_states.contiguous() - attn_output = nn.functional.scaled_dot_product_attention( + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, - dropout_p=self.dropout, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal=self.is_causal and attention_mask is None and q_len > 1, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 56a83c178c..5d8a3f987a 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -643,8 +643,8 @@ class LlamaSdpaAttention(LlamaAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an - # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index cc5d8f0862..a524c5958e 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -676,14 +676,18 @@ class MistralSdpaAttention(MistralAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 7bafe49905..dfdd114728 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -749,14 +749,18 @@ class MixtralSdpaAttention(MixtralAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 8e8b1fe284..d6f0ae96f4 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -618,6 +618,11 @@ class MusicgenSdpaAttention(MusicgenAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -626,8 +631,7 @@ class MusicgenSdpaAttention(MusicgenAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 9865a4b917..6458df0a1b 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -634,6 +634,11 @@ class MusicgenMelodySdpaAttention(MusicgenMelodyAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. 
Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -642,8 +647,7 @@ class MusicgenMelodySdpaAttention(MusicgenMelodyAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 8bfa3dc606..9b4b08239b 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -617,8 +617,8 @@ class OlmoSdpaAttention(OlmoAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() - # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an - # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True` + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 795ff18e5b..1f82b09a25 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -709,13 +709,17 @@ class PhiSdpaAttention(PhiAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 41765632b5..224aad0085 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -778,14 +778,18 @@ class Phi3SdpaAttention(Phi3Attention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index c9b8bd8fde..f90d61d6e6 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -681,14 +681,18 @@ class Qwen2SdpaAttention(Qwen2Attention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index f8d1bf6bb4..21d8d608ee 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -759,14 +759,18 @@ class Qwen2MoeSdpaAttention(Qwen2MoeAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 60cbb777c7..199d0e543a 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -852,6 +852,11 @@ class SEWSdpaAttention(SEWAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -860,8 +865,7 @@ class SEWSdpaAttention(SEWAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 91a6e83a8b..e8d07340d3 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -487,14 +487,18 @@ class StableLmSdpaAttention(StableLmAttention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout.p if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
- is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index ae64b23aa6..d808a3fd25 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -660,14 +660,18 @@ class Starcoder2SdpaAttention(Starcoder2Attention): key_states = key_states.contiguous() value_states = value_states.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. + is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=self.attention_dropout if self.training else 0.0, - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal=self.is_causal and attention_mask is None and q_len > 1, + is_causal=is_causal, ) attn_output = attn_output.transpose(1, 2).contiguous() diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 5c1557fb1f..aa4bb7827e 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -888,6 +888,11 @@ class UniSpeechSdpaAttention(UniSpeechAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -896,8 +901,7 @@ class UniSpeechSdpaAttention(UniSpeechAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 853c521e5e..4663fc05d8 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -905,6 +905,11 @@ class UniSpeechSatSdpaAttention(UniSpeechSatAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -913,8 +918,7 @@ class UniSpeechSatSdpaAttention(UniSpeechSatAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index ec928762b5..5fb64c1f2c 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -953,6 +953,11 @@ class Wav2Vec2SdpaAttention(Wav2Vec2Attention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -961,8 +966,7 @@ class Wav2Vec2SdpaAttention(Wav2Vec2Attention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index c0db404e5c..e4fda437bf 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -686,6 +686,11 @@ class WhisperSdpaAttention(WhisperAttention): query_states = self._shape(query_states, tgt_len, bsz) + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 attn_output = torch.nn.functional.scaled_dot_product_attention( @@ -694,8 +699,7 @@ class WhisperSdpaAttention(WhisperAttention): value_states, attn_mask=attention_mask, dropout_p=self.dropout if self.training else 0.0, - # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. - is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + is_causal=is_causal, ) if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 533eedb7f3..5f3ac898da 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4014,6 +4014,47 @@ class ModelTesterMixin: with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): _ = model(**inputs_dict) + @require_torch_sdpa + @require_torch_gpu + @slow + def test_sdpa_can_compile_dynamic(self): + compute_capability = torch.cuda.get_device_capability() + major, _ = compute_capability + + if not torch.version.cuda or major < 8: + self.skipTest("This test requires an NVIDIA GPU with compute capability >= 8.0") + + for model_class in self.all_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict = self._prepare_for_class(inputs_dict, model_class) + if config.model_type in ["dbrx"]: + self.skipTest( + "DBRX (transformers==4.40) requires a modification to support dynamic shapes with compile." + ) + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, attn_implementation="sdpa") + model.to(torch_device) + + # For PyTorch 2.1 - 2.3.0 set `dynamic=True`. In the future setting `dynamic=None` and using `torch._dynamo.mark_dynamic()` + # on input tensors will be required. `mark_dynamic` currently raises inconsistent shape errors. 
+ model = torch.compile(model, dynamic=True) + + inputs_dict.pop("attention_mask", None) + inputs_dict.pop("decoder_attention_mask", None) + for name, inp in inputs_dict.items(): + if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]: + inputs_dict[name] = inp.to(torch.float16) + + # use no_grad to save some memory + with torch.no_grad(): + _ = model(**inputs_dict) + @require_torch_sdpa @slow def test_eager_matches_sdpa_generate(self): From 5d0bf59b4d5be72c8c956e0240a67d7c3100fdaf Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Mon, 20 May 2024 13:45:56 +0500 Subject: [PATCH 04/80] LLaVa-Next: Update docs with batched inference (#30857) * update docs with batch ex * Update docs/source/en/model_doc/llava_next.md Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * accept nested list of img --------- Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> --- docs/source/en/model_doc/llava_next.md | 41 +++++++++++++++++++ .../llava_next/image_processing_llava_next.py | 26 +++++++++++- .../test_image_processor_llava_next.py | 18 ++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index a2a3913fca..a4a1419ee0 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -68,6 +68,8 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ ## Usage example +### Single image inference + Here's how to load the model and perform inference in half-precision (`torch.float16`): ```python @@ -94,6 +96,45 @@ output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) ``` +### Multi image inference + +LLaVa-Next can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it: + +```python +import requests +from PIL import Image +import torch +from transformers import AutoProcessor, LlavaNextForConditionalGeneration + +# Load the model in half-precision +model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto") +processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + +# Get three different images +url = "https://www.ilankelman.org/stopsigns/australia.jpg" +image_stop = Image.open(requests.get(url, stream=True).raw) + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_cats = Image.open(requests.get(url, stream=True).raw) + +url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" +image_snowman = Image.open(requests.get(url, stream=True).raw) + +# Prepare a batched prompt, where the first one is a multi-turn conversation and the second is not +prompt = [ + "[INST] \nWhat is shown in this image? [/INST] There is a red stop sign in the image. [INST] \nWhat about this image? How many cats do you see [/INST]", + "[INST] \nWhat is shown in this image? 
[/INST]" +] + +# We can simply feed images in the order they have to be used in the text prompt +# Each "" token uses one image leaving the next for the subsequent "" tokens +inputs = processor(text=prompt, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device) + +# Generate +generate_ids = model.generate(**inputs, max_new_tokens=30) +processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) +``` + ## Model optimization ### Quantization using Bitsandbytes diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 34de0f4db0..6295fb9562 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -37,6 +37,7 @@ from ...image_utils import ( get_image_size, infer_channel_dimension_format, is_scaled_image, + is_valid_image, make_list_of_images, to_numpy_array, valid_images, @@ -52,6 +53,29 @@ if is_vision_available(): from PIL import Image +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. + + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched video from {images}") + + def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: """ Divides an image into patches of a specified size. 
@@ -651,7 +675,7 @@ class LlavaNextImageProcessor(BaseImageProcessor): do_pad = do_pad if do_pad is not None else self.do_pad do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb - images = make_list_of_images(images) + images = make_batched_images(images) if not valid_images(images): raise ValueError( diff --git a/tests/models/llava_next/test_image_processor_llava_next.py b/tests/models/llava_next/test_image_processor_llava_next.py index 7369f8a918..8b1f98bbca 100644 --- a/tests/models/llava_next/test_image_processor_llava_next.py +++ b/tests/models/llava_next/test_image_processor_llava_next.py @@ -199,3 +199,21 @@ class LlavaNextImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): @unittest.skip("LlavaNextImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_numpy_4_channels(self): pass + + def test_nested_input(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=True) + + # Test batched as a list of images + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images.shape), expected_output_image_shape) + + # Test batched as a nested list of images, where each sublist is one batch + image_inputs_nested = [image_inputs[:3], image_inputs[3:]] + encoded_images_nested = image_processing(image_inputs_nested, return_tensors="pt").pixel_values + expected_output_image_shape = (7, 1445, 3, 18, 18) + self.assertEqual(tuple(encoded_images_nested.shape), expected_output_image_shape) + + # Image processor should return same pixel values, independently of ipnut format + self.assertTrue((encoded_images_nested == encoded_images).all()) From 66b0d9ee5d440518a990d604025f916ac710d701 Mon Sep 17 00:00:00 2001 From: Donggeun Yu Date: Mon, 20 May 2024 17:51:04 +0900 Subject: [PATCH 05/80] DeformableDETR two stage support bfloat16 (#30907) Update modeling_deformable_detr.py --- .../models/deformable_detr/modeling_deformable_detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index bab552da35..b77ae4f8f8 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1616,8 +1616,8 @@ class DeformableDetrModel(DeformableDetrPreTrainedModel): valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) grid_y, grid_x = meshgrid( - torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), - torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, height - 1, height, dtype=enc_output.dtype, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=enc_output.dtype, device=enc_output.device), indexing="ij", ) grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) From 1c2bb3ac54d18a0bfc3e212b73f8d1c4aac3ea48 Mon Sep 17 00:00:00 2001 From: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com> Date: Mon, 20 May 2024 10:53:58 +0200 Subject: [PATCH 06/80] add return_token_timestamps to WhisperProcessor (#30812) * compute num_frames in WhisperFeatureExtractor * add return_num_frames in WhisperFeatureProcessor + adapt pipeline * return_timestamps 
renaming + pipeline fix * fix * fix * fix * add tests * Update src/transformers/models/whisper/feature_extraction_whisper.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * apply review changes * fix * Update src/transformers/models/whisper/feature_extraction_whisper.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * Update tests/models/whisper/test_modeling_whisper.py Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> * apply review * fix * review changes * Update src/transformers/models/whisper/feature_extraction_whisper.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * make style quality * EXPECTED_OUTPUT in single line * small numpy->torch fix * fix --------- Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../whisper/feature_extraction_whisper.py | 8 ++ .../models/whisper/generation_whisper.py | 15 ++- .../pipelines/automatic_speech_recognition.py | 28 +++--- tests/models/whisper/test_modeling_whisper.py | 93 +++++++++++++++++++ 4 files changed, 127 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index 508e85b91f..f2d6da5660 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -188,6 +188,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): sampling_rate: Optional[int] = None, do_normalize: Optional[bool] = None, device: Optional[str] = "cpu", + return_token_timestamps: Optional[bool] = None, **kwargs, ) -> BatchFeature: """ @@ -237,6 +238,9 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): device (`str`, *optional*, defaults to `'cpu'`): Specifies the device for computation of the log-mel spectrogram of audio signals in the `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda") + return_token_timestamps (`bool`, *optional*, defaults to `None`): + Whether or not to return the number of frames of the input raw_speech. + These num_frames can be used by the model to compute word level timestamps. """ if sampling_rate is not None: @@ -302,6 +306,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): if isinstance(input_features[0], List): padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] + else: padded_inputs["input_features"] = input_features @@ -309,6 +314,9 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): # rescale from sample (48000) to feature (3000) padded_inputs["attention_mask"] = padded_inputs["attention_mask"][:, :: self.hop_length] + if return_token_timestamps is not None: + padded_inputs["num_frames"] = [len(raw_speech_i) // self.hop_length for raw_speech_i in raw_speech] + if return_tensors is not None: padded_inputs = padded_inputs.convert_to_tensors(return_tensors) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index c58b0d35e5..2bdff6e534 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -209,11 +209,15 @@ class WhisperGenerationMixin: # 2. 
num_frames is different, compute the DTW matrix for each sample sequentially # we're using np.unique because num_frames can be int/list/tuple - if len(np.unique(num_frames)) == 1: - # if num_frames is the same, no need to recompute matrix, std and mean for each element of the batch - num_frames = num_frames if isinstance(num_frames, int) else num_frames[0] - + if isinstance(num_frames, int): weights = weights[..., : num_frames // 2] + + elif isinstance(num_frames, (list, tuple, np.ndarray)) and len(np.unique(num_frames)) == 1: + weights = weights[..., : num_frames[0] // 2] + + elif isinstance(num_frames, (torch.Tensor)) and len(torch.unique(num_frames)) == 1: + weights = weights[..., : num_frames[0] // 2] + else: # num_frames is of shape (batch_size,) whereas batch_size is truely batch_size*num_return_sequences repeat_time = batch_size if isinstance(num_frames, int) else batch_size // len(num_frames) @@ -231,7 +235,7 @@ class WhisperGenerationMixin: # Perform dynamic time warping on each element of the batch. for batch_idx in range(batch_size): - if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray)): + if num_frames is not None and isinstance(num_frames, (tuple, list, np.ndarray, torch.Tensor)): matrix = weights[batch_idx, ..., : num_frames[batch_idx] // 2] # Normalize and smoothen the weights. @@ -475,6 +479,7 @@ class WhisperGenerationMixin: "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", FutureWarning, ) + # 1. prepare generation config generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index de1a9b57ac..123dbcdb67 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -443,11 +443,18 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): return_tensors="pt", ) else: - processed = self.feature_extractor( - inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" - ) - if stride is None: - extra["segment_size"] = len(inputs) + if self.type == "seq2seq_whisper" and stride is None: + processed = self.feature_extractor( + inputs, + sampling_rate=self.feature_extractor.sampling_rate, + return_tensors="pt", + return_token_timestamps=True, + ) + extra["num_frames"] = processed.pop("num_frames") + else: + processed = self.feature_extractor( + inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) if self.torch_dtype is not None: processed = processed.to(dtype=self.torch_dtype) @@ -461,11 +468,11 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs): attention_mask = model_inputs.pop("attention_mask", None) stride = model_inputs.pop("stride", None) - segment_size = model_inputs.pop("segment_size", None) + num_frames = model_inputs.pop("num_frames", None) is_last = model_inputs.pop("is_last") - if stride is not None and segment_size is not None: - raise ValueError("segment_size must be used only when stride is None") + if stride is not None and num_frames is not None: + raise ValueError("num_frames must be used only when stride is None") if self.type in {"seq2seq", "seq2seq_whisper"}: encoder = self.model.get_encoder() @@ -495,10 +502,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): generate_kwargs["num_frames"] = [s[0] // 
self.feature_extractor.hop_length for s in stride] else: - if isinstance(segment_size, int): - generate_kwargs["num_frames"] = segment_size // self.feature_extractor.hop_length - else: - generate_kwargs["num_frames"] = segment_size[0] // self.feature_extractor.hop_length + generate_kwargs["num_frames"] = num_frames if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames: generate_kwargs["input_features"] = inputs diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index fed1b9c059..58acb5f2fd 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1950,6 +1950,69 @@ class WhisperModelIntegrationTests(unittest.TestCase): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + @slow + def test_large_timestamp_generation(self): + set_seed(0) + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") + model.to(torch_device) + + input_speech = np.concatenate(self._load_datasamples(4)) + input_features = processor( + input_speech, return_tensors="pt", sampling_rate=16_000, return_token_timestamps=True + ).input_features + input_features = input_features.to(torch_device) + + generated_ids = model.generate(input_features, max_length=448, return_timestamps=True).to("cpu") + + # fmt: off + EXPECTED_OUTPUT = torch.tensor([50258, 50259, 50360, 50365, 2221, 13, 2326, 388, 391, 307, 264, 50244, 295, 264, 2808, 5359, 11, 293, 321, 366, 5404, 281, 2928, 702, 14943, 13, 50629, 50682, 6966, 307, 2221, 13, 2326, 388, 391, 311, 9060, 1570, 1880, 813, 702, 1871, 13, 50870, 50911, 634, 5112, 505, 300, 412, 341, 42729, 3196, 295, 264, 1064, 11, 365, 5272, 293, 12904, 9256, 450, 10539, 949, 505, 11, 51245, 51287, 1034, 4680, 10117, 490, 3936, 293, 1080, 3542, 5160, 881, 26336, 281, 264, 1575, 13, 51494, 51523, 634, 575, 12525, 22618, 1968, 6144, 35617, 1456, 397, 266, 311, 589, 307, 534, 10281, 934, 439, 11, 51799, 51815, 50257]) + # fmt: on + self.assertTrue(torch.allclose(generated_ids, EXPECTED_OUTPUT)) + + EXPECTED_TRANSCRIPT = [ + { + "text": ( + " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." + " Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive" + " season of the year, with Christmas and roast beef looming before us, similes drawn from eating" + " and its results occur most readily to the mind. He has grave doubts whether Sir Frederick " + "Leighton's work is really Greek after all," + ), + "offsets": [ + { + "text": ( + " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel." + ), + "timestamp": (0.0, 5.28), + }, + { + "text": " Nor is Mr. 
Quilter's manner less interesting than his matter.", + "timestamp": (6.34, 10.1), + }, + { + "text": ( + " He tells us that at this festive season of the year, with Christmas and roast beef looming before us," + ), + "timestamp": (10.92, 17.6), + }, + { + "text": (" similes drawn from eating and its results occur most readily to the mind."), + "timestamp": (18.44, 22.580000000000002), + }, + { + "text": ( + " He has grave doubts whether Sir Frederick Leighton's work is really Greek after all," + ), + "timestamp": (23.16, 28.68), + }, + ], + } + ] + + transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) + self.assertEqual(transcript, EXPECTED_TRANSCRIPT) + @slow def test_tiny_token_timestamp_generation(self): set_seed(0) @@ -1979,6 +2042,36 @@ class WhisperModelIntegrationTests(unittest.TestCase): self.assertTrue(torch.allclose(generate_outputs.token_timestamps.to("cpu"), EXPECTED_OUTPUT)) + @slow + def test_large_token_timestamp_generation(self): + set_seed(0) + processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3") + model.to(torch_device) + + input_speech = self._load_datasamples(4) + input_features = processor( + input_speech, return_tensors="pt", sampling_rate=16_000, return_token_timestamps=True + ) + input_features = input_features.to(torch_device) + + generate_outputs = model.generate( + **input_features, max_length=448, return_timestamps=True, return_token_timestamps=True + ) + + self.assertEqual(generate_outputs.sequences.shape, generate_outputs.token_timestamps.shape) + + # fmt: off + EXPECTED_OUTPUT = torch.tensor([ + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6200, 0.7400, 0.8600, 1.0000, 1.0400, 1.3000, 1.4400, 1.7800, 2.1800, 2.2800, 2.5000, 2.9200, 3.0000, 3.3800, 3.5000, 3.6000, 3.8400, 4.1000, 4.4000, 4.6800, 5.1400, 5.3600, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200, 5.8200], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.6000, 0.9200, 1.2200, 1.3400, 1.4200, 1.5400, 1.5800, 1.7400, 2.0600, 2.3800, 3.0400, 3.3800, 3.6400, 4.1200, 4.3600, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800, 4.7800], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5400, 0.8200, 1.1600, 1.4600, 1.7400, 1.8800, 2.3400, 2.7400, 3.1400, 3.2200, 3.5400, 4.2800, 4.5600, 4.8200, 5.0600, 5.3200, 5.6600, 5.9600, 6.1400, 6.4000, 6.8400, 7.8800, 8.0200, 8.3600, 8.7000, 9.0200, 9.3200, 9.5000, 9.8400, 10.3000, 10.6600, 11.0800, 11.3600, 11.4600, 11.8000, 12.4600], + [ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5600, 0.7600, 1.0600, 1.4000, 1.8800, 2.2600, 2.6200, 2.8000, 2.9600, 3.0000, 3.2000, 3.4400, 3.6800, 4.0000, 4.6000, 5.0000, 5.3200, 5.4800, 6.0600, 6.0600, 6.1000, 6.3200, 6.7400, 7.0000, 7.2200, 7.4000, 7.7600, 8.0600, 8.5600, 8.8600, 8.9400, 9.1000, 9.3400, 9.8800, 9.8800, 9.8800] + ]) + # fmt: on + + self.assertTrue(torch.allclose(generate_outputs.token_timestamps.to("cpu"), EXPECTED_OUTPUT)) + @slow def test_tiny_token_timestamp_batch_generation(self): set_seed(0) From 1834916481235948f413d3b965f96ccd3b2e44ae Mon Sep 17 00:00:00 2001 From: Goncalo Paulo <30472805+SrGonao@users.noreply.github.com> Date: Mon, 20 May 2024 11:18:09 +0200 Subject: [PATCH 07/80] Fix num_hidden_layers in initialization of new model in Mamba (#30403) Fix num_hidden_layers in initialization Originally, the 
initialization was using config.num_layers instead of config.num_hidden_layers. This fixes that. --- src/transformers/models/mamba/modeling_mamba.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 4834fd6247..bb67b62008 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -399,7 +399,7 @@ class MambaPreTrainedModel(PreTrainedModel): # Having just p *= scale would repeatedly scale it down nn.init.kaiming_uniform_(p, a=math.sqrt(5)) with torch.no_grad(): - p /= math.sqrt(self.config.num_layers) + p /= math.sqrt(self.config.num_hidden_layers) @dataclass From 7b4b4564386dc4e18bf38fc288d87700824d4191 Mon Sep 17 00:00:00 2001 From: Eric2i Date: Mon, 20 May 2024 02:18:17 -0700 Subject: [PATCH 08/80] separate kwargs in processor (similar to #30193) (#30905) * Fix similar bug in processor (related to #30193) * Reformat processing_git.py to comply with ruff formatting --- src/transformers/models/git/processing_git.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 79f26f3bf2..98649c644e 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -76,15 +76,21 @@ class GitProcessor(ProcessorMixin): `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + tokenizer_kwargs, image_processor_kwargs = {}, {} + if kwargs: + tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys} + image_processor_kwargs = { + k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys + } if text is None and images is None: raise ValueError("You have to specify either text or images. 
Both cannot be none.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values From c11ac7857bf93914f69a209d50a918ad7fe1d9a3 Mon Sep 17 00:00:00 2001 From: Hafedh <70411813+not-lain@users.noreply.github.com> Date: Mon, 20 May 2024 10:38:32 +0100 Subject: [PATCH 09/80] fix for custom pipeline configuration (#29004) * fix for custom pipeline configuration * fix for custom pipelines * remove extra exception * added test for custom pipelines extra tag * format with ruff * limit extra tag for first time only * format with ruff * improve tests for custom pipelines --- src/transformers/configuration_utils.py | 5 +++++ src/transformers/feature_extraction_utils.py | 14 ++++++++++---- src/transformers/image_processing_utils.py | 15 ++++++++++----- src/transformers/processing_utils.py | 14 ++++++++++---- src/transformers/tokenization_utils_base.py | 20 +++++++++++++------- src/transformers/utils/__init__.py | 1 + src/transformers/utils/generic.py | 13 +++++++++++++ tests/pipelines/test_pipelines_common.py | 18 ++++++++++++++++++ 8 files changed, 80 insertions(+), 20 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index fc7f782e34..86b946d162 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -32,6 +32,7 @@ from .utils import ( CONFIG_NAME, PushToHubMixin, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_file, copy_func, download_url, @@ -736,6 +737,10 @@ class PretrainedConfig(PushToHubMixin): config_dict["auto_map"] = add_model_info_to_auto_map( config_dict["auto_map"], pretrained_model_name_or_path ) + if "custom_pipelines" in config_dict and not is_local: + config_dict["custom_pipelines"] = add_model_info_to_custom_pipelines( + config_dict["custom_pipelines"], pretrained_model_name_or_path + ) return config_dict, kwargs @classmethod diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index 76070ebeb8..44e01f8d85 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -31,6 +31,7 @@ from .utils import ( PushToHubMixin, TensorType, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_file, copy_func, download_url, @@ -539,10 +540,15 @@ class FeatureExtractionMixin(PushToHubMixin): f"loading configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}" ) - if "auto_map" in feature_extractor_dict and not is_local: - feature_extractor_dict["auto_map"] = add_model_info_to_auto_map( - feature_extractor_dict["auto_map"], pretrained_model_name_or_path - ) + if not is_local: + if "auto_map" in feature_extractor_dict: + feature_extractor_dict["auto_map"] = add_model_info_to_auto_map( + feature_extractor_dict["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in feature_extractor_dict: + feature_extractor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines( + feature_extractor_dict["custom_pipelines"], pretrained_model_name_or_path + ) return 
feature_extractor_dict, kwargs diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py index efd1e04a62..c42378d8f3 100644 --- a/src/transformers/image_processing_utils.py +++ b/src/transformers/image_processing_utils.py @@ -31,6 +31,7 @@ from .utils import ( IMAGE_PROCESSOR_NAME, PushToHubMixin, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_file, copy_func, download_url, @@ -375,11 +376,15 @@ class ImageProcessingMixin(PushToHubMixin): f"loading configuration file {image_processor_file} from cache at {resolved_image_processor_file}" ) - if "auto_map" in image_processor_dict and not is_local: - image_processor_dict["auto_map"] = add_model_info_to_auto_map( - image_processor_dict["auto_map"], pretrained_model_name_or_path - ) - + if not is_local: + if "auto_map" in image_processor_dict: + image_processor_dict["auto_map"] = add_model_info_to_auto_map( + image_processor_dict["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in image_processor_dict: + image_processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines( + image_processor_dict["custom_pipelines"], pretrained_model_name_or_path + ) return image_processor_dict, kwargs @classmethod diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 463591032c..a8e47fb683 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -30,6 +30,7 @@ from .utils import ( PROCESSOR_NAME, PushToHubMixin, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_file, copy_func, direct_transformers_import, @@ -355,10 +356,15 @@ class ProcessorMixin(PushToHubMixin): else: logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}") - if "auto_map" in processor_dict and not is_local: - processor_dict["auto_map"] = add_model_info_to_auto_map( - processor_dict["auto_map"], pretrained_model_name_or_path - ) + if not is_local: + if "auto_map" in processor_dict: + processor_dict["auto_map"] = add_model_info_to_auto_map( + processor_dict["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in processor_dict: + processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines( + processor_dict["custom_pipelines"], pretrained_model_name_or_path + ) return processor_dict, kwargs diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 395f9859cd..4cb75c9864 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -42,6 +42,7 @@ from .utils import ( TensorType, add_end_docstrings, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_file, copy_func, download_url, @@ -2177,13 +2178,18 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): config_tokenizer_class = None init_kwargs = init_configuration - if "auto_map" in init_kwargs and not _is_local: - # For backward compatibility with odl format. - if isinstance(init_kwargs["auto_map"], (tuple, list)): - init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]} - init_kwargs["auto_map"] = add_model_info_to_auto_map( - init_kwargs["auto_map"], pretrained_model_name_or_path - ) + if not _is_local: + if "auto_map" in init_kwargs: + # For backward compatibility with odl format. 
+ if isinstance(init_kwargs["auto_map"], (tuple, list)): + init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]} + init_kwargs["auto_map"] = add_model_info_to_auto_map( + init_kwargs["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in init_kwargs: + init_kwargs["custom_pipelines"] = add_model_info_to_custom_pipelines( + init_kwargs["custom_pipelines"], pretrained_model_name_or_path + ) if config_tokenizer_class is None: # Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo. diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 8c91463322..82d0b5001c 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -37,6 +37,7 @@ from .generic import ( PaddingStrategy, TensorType, add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, cached_property, can_return_loss, expand_dims, diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py index a8277588ff..1f332434a9 100644 --- a/src/transformers/utils/generic.py +++ b/src/transformers/utils/generic.py @@ -721,6 +721,19 @@ def add_model_info_to_auto_map(auto_map, repo_id): return auto_map +def add_model_info_to_custom_pipelines(custom_pipeline, repo_id): + """ + Adds the information of the repo_id to a given custom pipeline. + """ + # {custom_pipelines : {task: {"impl": "path.to.task"},...} } + for task in custom_pipeline.keys(): + if "impl" in custom_pipeline[task]: + module = custom_pipeline[task]["impl"] + if "--" not in module: + custom_pipeline[task]["impl"] = f"{repo_id}--{module}" + return custom_pipeline + + def infer_framework(model_class): """ Infers the framework of a given model without using isinstance(), because we cannot guarantee that the relevant diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 763c7d1a88..6ede7d1c7a 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -925,6 +925,24 @@ class DynamicPipelineTester(unittest.TestCase): # Can't make an isinstance check because the new_classifier is from the PairClassificationPipeline class of a # dynamic module self.assertEqual(new_classifier.__class__.__name__, "PairClassificationPipeline") + # check for tag exitence, tag needs to be added when we are calling a custom pipeline from the hub + # useful for cases such as finetuning + self.assertDictEqual( + new_classifier.model.config.custom_pipelines, + { + "pair-classification": { + "impl": f"{USER}/test-dynamic-pipeline--custom_pipeline.PairClassificationPipeline", + "pt": ("AutoModelForSequenceClassification",), + "tf": (), + } + }, + ) + # test if the pipeline still works after the model is finetuned + # (we are actually testing if the pipeline still works from the final repo) + # this is where the user/repo--module.class is used for + new_classifier.model.push_to_hub(repo_name=f"{USER}/test-pipeline-for-a-finetuned-model", token=self._token) + del new_classifier # free up memory + new_classifier = pipeline(model=f"{USER}/test-pipeline-for-a-finetuned-model", trust_remote_code=True) results = classifier("I hate you", second_text="I love you") new_results = new_classifier("I hate you", second_text="I love you") From e6708709cb7ee2cc04df641403ed0671ee7806c6 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Mon, 20 May 2024 13:40:42 +0200 Subject: [PATCH 10/80] Add AutoFeatureExtractor support to 
Wav2Vec2ProcessorWithLM (#28706) * Add AutoFeatureExtractor support to Wav2Vec2ProcessorWithLM * update with a type filter * add raises error test * fix added test --- .../processing_wav2vec2_with_lm.py | 21 ++++++++----- .../test_processor_wav2vec2_with_lm.py | 31 ++++++++++++++++++- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index b388be245f..9fceb1e61a 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -70,15 +70,15 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): with language model support into a single processor for language model boosted speech recognition decoding. Args: - feature_extractor ([`Wav2Vec2FeatureExtractor`]): - An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input. + feature_extractor ([`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]): + An instance of [`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input. tokenizer ([`Wav2Vec2CTCTokenizer`]): An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input. decoder (`pyctcdecode.BeamSearchDecoderCTC`): An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input. """ - feature_extractor_class = "Wav2Vec2FeatureExtractor" + feature_extractor_class = "AutoFeatureExtractor" tokenizer_class = "Wav2Vec2CTCTokenizer" def __init__( @@ -93,6 +93,11 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): if not isinstance(decoder, BeamSearchDecoderCTC): raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}") + if feature_extractor.__class__.__name__ not in ["Wav2Vec2FeatureExtractor", "SeamlessM4TFeatureExtractor"]: + raise ValueError( + f"`feature_extractor` has to be of type `Wav2Vec2FeatureExtractor` or `SeamlessM4TFeatureExtractor`, but is {type(feature_extractor)}" + ) + # make sure that decoder's alphabet and tokenizer's vocab match in content missing_decoder_tokens = self.get_missing_alphabet_tokens(decoder, tokenizer) if len(missing_decoder_tokens) > 0: @@ -117,7 +122,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): - This class method is simply calling Wav2Vec2FeatureExtractor's + This class method is simply calling the feature extractor's [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], Wav2Vec2CTCTokenizer's [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], and [`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`]. @@ -213,8 +218,8 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): def __call__(self, *args, **kwargs): """ - When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's - [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context + When used in normal mode, this method forwards all its arguments to the feature extractor's + [`~FeatureExtractionMixin.__call__`] and returns its output. If used in the context [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. 
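To make the effect of the looser `feature_extractor_class` concrete, an illustrative sketch (not part of the patch) follows. The checkpoint names and the decoder construction are assumptions chosen for demonstration only; any feature extractor resolved through `AutoFeatureExtractor` other than `Wav2Vec2FeatureExtractor` or `SeamlessM4TFeatureExtractor` would now raise a `ValueError` at construction time:

```python
from pyctcdecode import build_ctcdecoder

from transformers import AutoFeatureExtractor, Wav2Vec2CTCTokenizer, Wav2Vec2ProcessorWithLM

# resolves to SeamlessM4TFeatureExtractor, which the processor now accepts
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")

# build a beam-search decoder whose labels follow the tokenizer vocabulary order
sorted_vocab = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
decoder = build_ctcdecoder(labels=[token for token, _ in sorted_vocab])

processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=feature_extractor, tokenizer=tokenizer, decoder=decoder
)
```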
@@ -252,8 +257,8 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin): def pad(self, *args, **kwargs): """ - When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's - [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context + When used in normal mode, this method forwards all its arguments to the feature extractor's + [`~FeatureExtractionMixin.pad`] and returns its output. If used in the context [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods for more information. diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py index 2c52a92165..61dee30091 100644 --- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py +++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py @@ -25,7 +25,7 @@ import numpy as np from datasets import load_dataset from parameterized import parameterized -from transformers import AutoProcessor +from transformers import AutoFeatureExtractor, AutoProcessor from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES from transformers.testing_utils import require_pyctcdecode, require_torch, require_torchaudio, slow @@ -157,6 +157,35 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase): for key in input_feat_extract.keys(): self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + def test_another_feature_extractor(self): + feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0") + tokenizer = self.get_tokenizer() + decoder = self.get_decoder() + + processor = Wav2Vec2ProcessorWithLM(tokenizer=tokenizer, feature_extractor=feature_extractor, decoder=decoder) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + self.assertListEqual( + processor.model_input_names, + feature_extractor.model_input_names, + msg="`processor` and `feature_extractor` model input names do not match", + ) + + def test_wrong_feature_extractor_raises_error(self): + feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v3") + tokenizer = self.get_tokenizer() + decoder = self.get_decoder() + + with self.assertRaises(ValueError): + Wav2Vec2ProcessorWithLM(tokenizer=tokenizer, feature_extractor=feature_extractor, decoder=decoder) + def test_tokenizer(self): feature_extractor = self.get_feature_extractor() tokenizer = self.get_tokenizer() From 76e05301c3eb14c08a8c9e0dc3aadbc5a155f0e0 Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Mon, 20 May 2024 13:55:57 +0200 Subject: [PATCH 11/80] Fix a shape annotation and typos in `mamba` slow forward (#30691) * fix typos and one shape comment * fix `intermediade` typo in jamba --- src/transformers/models/jamba/modeling_jamba.py | 10 +++++----- src/transformers/models/mamba/modeling_mamba.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/jamba/modeling_jamba.py 
b/src/transformers/models/jamba/modeling_jamba.py index cfa80d2ce3..5a582acdd2 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -962,15 +962,15 @@ class JambaMambaMixer(nn.Module): # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) A = -torch.exp(self.A_log.float()) # [intermediate_size, ssm_state_size] discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size] - discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float() # [batch, intermediade_size, seq_len, ssm_state_size] + discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float() # [batch, intermediate_size, seq_len, ssm_state_size] deltaB_u = discrete_B * hidden_states[:, :, :, None].float() # 3.c perform the recurrence y ← SSM(A, B, C)(x) scan_outputs = [] for i in range(seq_len): - ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediade_size, ssm_state] - scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediade_size, 1] + ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediate_size, ssm_state] + scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediate_size, 1] scan_outputs.append(scan_output[:, :, 0]) - scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediade_size, seq_len] + scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediate_size, seq_len] scan_output = scan_output + (hidden_states * self.D[None, :, None]) scan_output = (scan_output * self.act(gate)) @@ -978,7 +978,7 @@ class JambaMambaMixer(nn.Module): cache_params.ssm_states[self.layer_idx] = ssm_state # 4. Final linear projection - contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] + contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] return contextualized_states # fmt: on diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index bb67b62008..a8309e0430 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -279,16 +279,16 @@ class MambaMixer(nn.Module): # 3.b. 
Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) A = -torch.exp(self.A_log.float()) # [intermediate_size, ssm_state_size] discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size] - discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float() # [batch, intermediade_size, seq_len, ssm_state_size] + discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float() # [batch, intermediate_size, seq_len, ssm_state_size] deltaB_u = discrete_B * hidden_states[:, :, :, None].float() # 3.c perform the recurrence y ← SSM(A, B, C)(x) scan_outputs = [] for i in range(seq_len): - ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediade_size, ssm_state] - scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediade_size, 1] + ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediate_size, ssm_state] + scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediate_size, 1] scan_outputs.append(scan_output[:, :, 0]) - scan_output = torch.stack(scan_outputs, dim=-1) # [batch, seq_len, intermediade_size] + scan_output = torch.stack(scan_outputs, dim=-1) # [batch, intermediate_size, seq_len] scan_output = scan_output + (hidden_states * self.D[None, :, None]) scan_output = (scan_output * self.act(gate)) @@ -296,7 +296,7 @@ class MambaMixer(nn.Module): cache_params.ssm_states[self.layer_idx].copy_(ssm_state) # 4. Final linear projection - contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] + contextualized_states = self.out_proj(scan_output.transpose(1, 2)) # [batch, seq_len, hidden_size] return contextualized_states # fmt: on From bb48e921868ac750417956de941606f7e2fa02ca Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 20 May 2024 13:56:11 +0200 Subject: [PATCH 12/80] `tokenizer_class = "AutoTokenizer"` Llava Family (#30912) propagate changes to more models --- src/transformers/models/llava_next/processing_llava_next.py | 4 ++-- src/transformers/models/video_llava/processing_video_llava.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 8a4b76e9c6..91cd544ab6 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -40,8 +40,8 @@ class LlavaNextProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "LlavaNextImageProcessor" - tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 0355d756ce..b71600501b 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -42,7 +42,7 @@ class VideoLlavaProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] image_processor_class = "VideoLlavaImageProcessor" - tokenizer_class = ("LlamaTokenizer", 
"LlamaTokenizerFast") + tokenizer_class = "AutoTokenizer" def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) From 92d1d97c05a01160d6e7fcf4198e93bf2cec0dfe Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Mon, 20 May 2024 09:21:40 -0400 Subject: [PATCH 13/80] Introduce configured_state arg for accelerator_config (#29781) * Introduce configured_state * Include note on tuning * Allow for users to have defined a state already * Include tests * Add note on hpam tune * Guard a bit better * Update src/transformers/training_args.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/training_args.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Finish rebase * Finish rebase * Guard carefully * Fixup test * Refactor * Fin refactor * Comment * Update wrt feedback --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/trainer_pt_utils.py | 14 +++ src/transformers/training_args.py | 134 ++++++++++++++++----------- tests/trainer/test_trainer.py | 14 +++ 3 files changed, 110 insertions(+), 52 deletions(-) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 9defa91b2b..8ac0281912 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -1250,6 +1250,10 @@ class AcceleratorConfig: Whether to use non-blocking CUDA calls to help minimize synchronization during distributed training with prepared `DataLoader` inputs being moved to device. Best if used with `pin_memory=True` in the `TrainingArguments`. + use_configured_state (`bool*, *optional*, defaults to `False`): + Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined + before calling `TrainingArguments`. If `True`, an `Accelerator` or `PartialState` + must be initialized. May lead to issues using sweeps or hyperparameter tuning. """ @@ -1312,6 +1316,13 @@ class AcceleratorConfig: " The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`." }, ) + use_configured_state: bool = field( + default=False, + metadata={ + "help": "Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`." + "If `True`, an `Accelerator` or `PartialState` must be initialized. May lead to issues using sweeps or hyperparameter tuning." + }, + ) @classmethod def from_json_file(cls, json_file): @@ -1331,6 +1342,9 @@ class AcceleratorConfig: def to_dict(self): return copy.deepcopy(self.__dict__) + def pop(self, key, default=None): + return self.__dict__.pop(key, default) + class LayerWiseDummyOptimizer(torch.optim.Optimizer): """ diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 2807c9951a..08d9000cb6 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -572,6 +572,10 @@ class TrainingArguments: training results are fully reproducable using a different sampling technique. While seed-to-seed results may differ, on average the differences are neglible when using multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results. + - use_configured_state (`bool`, *optional*, defaults to `False`): + Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`. + If `True`, an `Accelerator` or `PartialState` must be initialized. 
Note that by doing so, this could lead to issues + with hyperparameter tuning. label_smoothing_factor (`float`, *optional*, defaults to 0.0): The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded @@ -1635,6 +1639,39 @@ class TrainingArguments: if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16: raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0") + # We need to setup the accelerator config here *before* the first call to `self.device` + if is_accelerate_available(): + if not isinstance(self.accelerator_config, (AcceleratorConfig)): + if self.accelerator_config is None: + self.accelerator_config = AcceleratorConfig() + elif isinstance(self.accelerator_config, dict): + self.accelerator_config = AcceleratorConfig(**self.accelerator_config) + # Check that a user didn't pass in the class instantiator + # such as `accelerator_config = AcceleratorConfig` + elif isinstance(self.accelerator_config, type): + raise NotImplementedError( + "Tried passing in a callable to `accelerator_config`, but this is not supported. " + "Please pass in a fully constructed `AcceleratorConfig` object instead." + ) + else: + self.accelerator_config = AcceleratorConfig.from_json_file(self.accelerator_config) + + if self.dispatch_batches is not None: + warnings.warn( + "Using `--dispatch_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" + " `--accelerator_config {'dispatch_batches':VALUE} instead", + FutureWarning, + ) + self.accelerator_config.dispatch_batches = self.dispatch_batches + + if self.split_batches is not None: + warnings.warn( + "Using `--split_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" + " `--accelerator_config {'split_batches':VALUE} instead", + FutureWarning, + ) + self.accelerator_config.split_batches = self.split_batches + if ( self.framework == "pt" and is_torch_available() @@ -1873,37 +1910,6 @@ class TrainingArguments: os.environ[f"{prefix}USE_ORIG_PARAMS"] = str(self.fsdp_config.get("use_orig_params", "true")).lower() - if is_accelerate_available(): - if not isinstance(self.accelerator_config, (AcceleratorConfig)): - if self.accelerator_config is None: - self.accelerator_config = AcceleratorConfig() - elif isinstance(self.accelerator_config, dict): - self.accelerator_config = AcceleratorConfig(**self.accelerator_config) - # Check that a user didn't pass in the class instantiator - # such as `accelerator_config = AcceleratorConfig` - elif isinstance(self.accelerator_config, type): - raise NotImplementedError( - "Tried passing in a callable to `accelerator_config`, but this is not supported. " - "Please pass in a fully constructed `AcceleratorConfig` object instead." - ) - else: - self.accelerator_config = AcceleratorConfig.from_json_file(self.accelerator_config) - if self.dispatch_batches is not None: - warnings.warn( - "Using `--dispatch_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. Use" - " `--accelerator_config {'dispatch_batches':VALUE} instead", - FutureWarning, - ) - self.accelerator_config.dispatch_batches = self.dispatch_batches - - if self.split_batches is not None: - warnings.warn( - "Using `--split_batches` is deprecated and will be removed in version 4.41 of 🤗 Transformers. 
Use" - " `--accelerator_config {'split_batches':VALUE} instead", - FutureWarning, - ) - self.accelerator_config.split_batches = self.split_batches - if self.tpu_metrics_debug: warnings.warn( "using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" @@ -2056,32 +2062,62 @@ class TrainingArguments: f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: " "Please run `pip install transformers[torch]` or `pip install accelerate -U`" ) + # We delay the init of `PartialState` to the end for clarity + accelerator_state_kwargs = {"enabled": True, "use_configured_state": False} + if isinstance(self.accelerator_config, AcceleratorConfig): + accelerator_state_kwargs["use_configured_state"] = self.accelerator_config.pop( + "use_configured_state", False + ) + if accelerator_state_kwargs["use_configured_state"]: + if PartialState._shared_state == {}: + raise ValueError( + "Passing `'use_configured_state':True` to the AcceleratorConfig requires a pre-configured " + "`AcceleratorState` or `PartialState` to be defined before calling `TrainingArguments`. " + ) + # We rely on `PartialState` to yell if there's issues here (which it will) + self.distributed_state = PartialState(cpu=self.use_cpu) + if self.deepspeed and self.distributed_state.distributed_type != DistributedType.DEEPSPEED: + raise RuntimeError( + "Tried to use an already configured `Accelerator` or `PartialState` that was not initialized for DeepSpeed, " + "but also passed in a `deepspeed` configuration to the `TrainingArguments`. Please set " + "`use_configured_state:False` instead or setup your `Accelerator` or `PartialState` properly." + ) + else: AcceleratorState._reset_state(reset_partial_state=True) - self.distributed_state = None + self.distributed_state = None if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ: os.environ["ACCELERATE_USE_IPEX"] = "false" + + self._n_gpu = 1 if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")): - self.distributed_state = PartialState(cpu=True, backend=self.ddp_backend) + accelerator_state_kwargs["cpu"] = True + accelerator_state_kwargs["backend"] = self.ddp_backend self._n_gpu = 0 elif is_sagemaker_mp_enabled(): + accelerator_state_kwargs["enabled"] = False local_rank = smp.local_rank() device = torch.device("cuda", local_rank) - self._n_gpu = 1 torch.cuda.set_device(device) elif is_sagemaker_dp_enabled(): - self.distributed_state = PartialState(_use_sagemaker_dp=True) - self._n_gpu = 1 + accelerator_state_kwargs["_use_sagemaker_dp"] = True elif self.deepspeed: - # Need to do similar for Accelerator init - os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" - self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout)) - del os.environ["ACCELERATE_USE_DEEPSPEED"] - self._n_gpu = 1 + accelerator_state_kwargs["use_deepspeed"] = True + accelerator_state_kwargs["timeout"] = timedelta(seconds=self.ddp_timeout) else: - self.distributed_state = PartialState( - backend=self.ddp_backend, timeout=timedelta(seconds=self.ddp_timeout) - ) - self._n_gpu = 1 + accelerator_state_kwargs["backend"] = self.ddp_backend + accelerator_state_kwargs["timeout"] = timedelta(seconds=self.ddp_timeout) + + # Now we pop everything + if accelerator_state_kwargs.pop("enabled", False) and not accelerator_state_kwargs.pop( + "use_configured_state", False + ): + # We need to patch this env var when enabling to detect deepspeed + use_deepspeed = accelerator_state_kwargs.pop("use_deepspeed", False) + if 
use_deepspeed: + os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" + self.distributed_state = PartialState(**accelerator_state_kwargs) + if use_deepspeed: + del os.environ["ACCELERATE_USE_DEEPSPEED"] if not is_sagemaker_mp_enabled(): device = self.distributed_state.device self.local_rank = self.distributed_state.local_process_index @@ -2108,23 +2144,17 @@ class TrainingArguments: "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ " "or current PyTorch install was not built with MPS enabled." ) - if device.type == "mps": - self._n_gpu = 1 - elif self.use_cpu: + if self.use_cpu: device = torch.device("cpu") - self._n_gpu = 0 elif is_torch_xpu_available(): device = torch.device("xpu:0") torch.xpu.set_device(device) - self._n_gpu = 1 elif is_torch_mlu_available(): device = torch.device("mlu:0") torch.mlu.set_device(device) - self._n_gpu = 1 elif is_torch_npu_available(): device = torch.device("npu:0") torch.npu.set_device(device) - self._n_gpu = 1 else: # if n_gpu is > 1 we'll use nn.DataParallel. # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index c420da4052..1711f600ce 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -131,6 +131,10 @@ if is_torch_available(): # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") GRAD_ACCUM_KWARGS_VERSION_AVAILABLE = is_accelerate_available("0.28") +if is_accelerate_available(): + from accelerate import Accelerator + from accelerate.state import AcceleratorState + PATH_SAMPLE_TEXT = f"{get_tests_dir()}/fixtures/sample_text.txt" @@ -3266,6 +3270,16 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset) self.assertEqual(trainer.accelerator.split_batches, True) + def test_accelerator_custom_state(self): + AcceleratorState._reset_state(reset_partial_state=True) + with tempfile.TemporaryDirectory() as tmp_dir: + with self.assertRaises(ValueError) as cm: + _ = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config={"use_configured_state": True}) + self.assertIn("Please define this beforehand", str(cm.warnings[0].message)) + _ = Accelerator() + _ = RegressionTrainingArguments(output_dir=tmp_dir, accelerator_config={"use_configured_state": True}) + AcceleratorState._reset_state(reset_partial_state=True) + @require_accelerate_version_min_0_28 def test_accelerator_config_from_dict_grad_accum_num_steps(self): with tempfile.TemporaryDirectory() as tmp_dir: From 616bb11d487aabc231bb230b245c42214ea4b254 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Mon, 20 May 2024 10:27:24 -0400 Subject: [PATCH 14/80] Add torch.compile for Mistral (#30642) * first version * fix sliding window * fix style * add sliding window cache * fix style * address comments * fix test * fix style * move sliding window check inside cache init * revert changes on irrelevant files & add comment on SlidingWindowCache * address comments & fix style fix style * update causal mask * [run-slow] mistral * [run-slow] mistral * [run-slow] mistral * [run-slow] mistral * [run-slow] mistral * [run-slow] llama * [run-slow] mistral * [run-slow] mistral * [run-slow] mistral * revert CI from a10 to t4 * wrap up --- docs/source/en/llm_optims.md | 2 +- src/transformers/cache_utils.py | 121 +++++ 
 src/transformers/generation/utils.py | 53 ++-
 .../open_llama/modeling_open_llama.py | 4 +-
 .../models/falcon/modeling_falcon.py | 4 +-
 .../models/gemma/modeling_gemma.py | 5 +-
 .../models/gpt_neox/modeling_gpt_neox.py | 4 +-
 .../modeling_gpt_neox_japanese.py | 2 +-
 .../models/idefics/modeling_idefics.py | 2 +-
 .../models/llama/modeling_llama.py | 2 -
 .../models/mistral/modeling_mistral.py | 421 +++++++++++-------
 .../models/mixtral/modeling_mixtral.py | 20 +-
 .../models/persimmon/modeling_persimmon.py | 4 +-
 src/transformers/models/phi/modeling_phi.py | 4 +-
 .../models/qwen2/modeling_qwen2.py | 6 +-
 .../models/qwen2_moe/modeling_qwen2_moe.py | 6 +-
 .../models/stablelm/modeling_stablelm.py | 4 +-
 .../models/starcoder2/modeling_starcoder2.py | 13 +-
 tests/models/mistral/test_modeling_mistral.py | 72 ++-
 19 files changed, 512 insertions(+), 237 deletions(-)

diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md
index 4b44c1d78c..5e49f0e1eb 100644
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@@ -29,7 +29,7 @@ To optimize this, you can use a kv-cache to store the past keys and values inste
 The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up.
 
 > [!WARNING]
-> Currently, only [Command R](./model_doc/cohere), [Gemma](./model_doc/gemma) and [Llama](./model_doc/llama2) models support static kv-cache and torch.compile.
+> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
 
 For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model.
 
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
index 2e29e19ade..990a863e18 100644
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@@ -448,3 +448,124 @@ class StaticCache(Cache):
         # In-place ops prevent breaking the static address
         self.key_cache[layer_idx].zero_()
         self.value_cache[layer_idx].zero_()
+
+
+class SlidingWindowCache(Cache):
+    """
+    Sliding Window Cache class to be used with `torch.compile` for models like Mistral that support sliding window attention.
+    Every time we try to update the cache, we compute the `indices` based on `cache_position >= self.config.sliding_window_size - 1`,
+    if true (which means the cache cannot hold all the old key value states and new states together because of the sliding window constraint),
+    we need to do a cycle shift based on `indices` to replace the oldest states with the new key value states passed in.
+
+    The `to_shift` is only true once we are above sliding_window_size. Thus with `sliding_window_size==64`:
+
+    indices = (slicing + to_shift[-1].int()-1) % self.config.sliding_window_size
+    tensor([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
+    37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+    55, 56, 57, 58, 59, 60, 61, 62, 63, 0])
+
+    We overwrite the cache using these indices, then we always write at cache_position (clamped to `sliding_window_size`).
+
+    Parameters:
+        config (`PretrainedConfig`):
+            The configuration file defining the shape-related attributes required to initialize the static cache.
+        max_batch_size (`int`):
+            The maximum batch size with which the model will be used.
+ max_cache_len (`int`): + The maximum sequence length with which the model will be used. + device (`torch.device`): + The device on which the cache should be initialized. Should be the same as the layer. + dtype (*optional*, defaults to `torch.float32`): + The default `dtype` to use when initializing the layer. + """ + + def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None: + if not hasattr(config, "sliding_window") or config.sliding_window is None: + raise ValueError( + "Setting `cache_implementation` to 'sliding_window' requires the model config supporting " + "sliding window attention, please check if there is a `sliding_window` field in the model " + "config and it's not set to None." + ) + + super().__init__() + self.max_batch_size = max_batch_size + # take the minimum of max_cache_len and config.sliding_window so that we allocate less memory + # when we do short-sentence generation + self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len + self.model_sliding_window_size = config.sliding_window + self.sliding_window_size = min(self.max_cache_len, self.model_sliding_window_size) + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads + self.head_dim = ( + config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + ) + + self.dtype = dtype if dtype is not None else torch.float32 + self.num_key_value_heads = ( + config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads + ) + + cache_shape = ( + config.num_hidden_layers, + max_batch_size, + self.num_key_value_heads, + self.sliding_window_size, + self.head_dim, + ) + + self.key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + self.value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) + + torch._dynamo.mark_static_address(self.key_cache) + torch._dynamo.mark_static_address(self.value_cache) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor]: + cache_position = cache_kwargs.get("cache_position") + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + + # assume this only happens in prefill phase when prompt length > sliding_window_size + if cache_position.shape[0] > self.sliding_window_size: + k_out = key_states[:, :, -self.sliding_window_size :, :] + v_out = value_states[:, :, -self.sliding_window_size :, :] + self.key_cache[layer_idx] = k_out + self.value_cache[layer_idx] = v_out + # we should return the whole states instead of k_out, v_out to take the whole prompt + # into consideration when building kv cache instead of just throwing away tokens outside of the window + return key_states, value_states + + slicing = torch.ones(self.sliding_window_size, dtype=torch.long, device=value_states.device).cumsum(0) + cache_position = cache_position.clamp(0, self.sliding_window_size - 1) + to_shift = cache_position >= self.sliding_window_size - 1 + indices = (slicing + to_shift[-1].int() - 1) % self.sliding_window_size + + k_out = k_out[:, :, indices] + v_out = v_out[:, :, indices] + + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + self.key_cache[layer_idx] = k_out + self.value_cache[layer_idx] = v_out + + return k_out, v_out + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + # assume this 
will be called only in the first generation step
+        # `cache_position` will be used in other cases
+        return 0
+
+    def get_max_length(self) -> Optional[int]:
+        # in theory there is no limit because the sliding window size is fixed
+        # no matter how long the sentence is
+        return None
+
+    def reset(self):
+        self.key_cache.zero_()
+        self.value_cache.zero_()
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 4b71fe9519..bfaa665979 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -24,7 +24,7 @@ import torch
 import torch.distributed as dist
 from torch import nn
 
-from ..cache_utils import Cache, DynamicCache, StaticCache
+from ..cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
 from ..models.auto import (
@@ -96,9 +96,7 @@ logger = logging.get_logger(__name__)
 if is_accelerate_available():
     from accelerate.hooks import AlignDevicesHook, add_hook_to_module
 
-NEED_SETUP_CACHE_CLASSES_MAPPING = {
-    "static": StaticCache,
-}
+NEED_SETUP_CACHE_CLASSES_MAPPING = {"static": StaticCache, "sliding_window": SlidingWindowCache}
 
 @dataclass
@@ -1326,24 +1324,33 @@ class GenerationMixin:
             model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device)
         return model_kwargs
 
-    def _get_static_cache(self, max_batch_size: int, max_cache_len: int) -> StaticCache:
+    def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int) -> Cache:
         """
-        Sets a static cache for `generate`, that will persist across calls. A new cache will only be initialized a
+        Sets a cache for `generate` that will persist across calls. A new cache will only be initialized if a
         new `generate` call requires a larger cache.
 
-        Returns the resulting static cache object.
+        Returns the resulting cache object.
         """
-        needs_new_cache = (
-            not hasattr(self, "_static_cache")
-            or self._static_cache.max_batch_size < max_batch_size
-            or self._static_cache.max_cache_len < max_cache_len
+        cache_cls: Cache = NEED_SETUP_CACHE_CLASSES_MAPPING[cache_implementation]
+        need_new_cache = (
+            not hasattr(self, "_cache")
+            or (not isinstance(self._cache, cache_cls))
+            or self._cache.max_batch_size < max_batch_size
         )
+        if cache_implementation == "sliding_window":
+            need_new_cache = need_new_cache or (
+                self._cache.sliding_window_size < self._cache.model_sliding_window_size
+                and max_cache_len > self._cache.max_cache_len
+            )
+        elif cache_implementation == "static":
+            need_new_cache = need_new_cache or self._cache.max_cache_len < max_cache_len
+
+        if need_new_cache:
             if hasattr(self.config, "_pre_quantization_dtype"):
                 cache_dtype = self.config._pre_quantization_dtype
             else:
                 cache_dtype = self.dtype
-            self._static_cache = StaticCache(
+            self._cache = cache_cls(
                 config=self.config,
                 max_batch_size=max_batch_size,
                 max_cache_len=max_cache_len,
@@ -1351,8 +1358,8 @@ class GenerationMixin:
                 dtype=cache_dtype,
             )
         else:
-            self._static_cache.reset()  # reset the cache for a new generation
-        return self._static_cache
+            self._cache.reset()
+        return self._cache
 
     def _prepare_special_tokens(
         self,
@@ -1615,14 +1622,14 @@ class GenerationMixin:
                     "This model does not support the `cache_implementation` argument. Please check the following "
                     "issue: https://github.com/huggingface/transformers/issues/28981."
) - if generation_config.cache_implementation == "static": - if not self._supports_static_cache: - raise ValueError( - "This model does not support `cache_implementation='static'`. Please check the following " - "issue: https://github.com/huggingface/transformers/issues/28981" - ) - model_kwargs["past_key_values"] = self._get_static_cache(batch_size, generation_config.max_length) - + if generation_config.cache_implementation == "static" and not self._supports_static_cache: + raise ValueError( + "This model does not support `cache_implementation='static'`. Please check the following " + "issue: https://github.com/huggingface/transformers/issues/28981" + ) + model_kwargs["past_key_values"] = self._get_cache( + generation_config.cache_implementation, batch_size, generation_config.max_length + ) self._validate_generated_length(generation_config, input_ids_length, has_default_max_length) # 7. determine generation mode diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 098f8c7da5..4e42d716e8 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -63,7 +63,7 @@ class OpenLlamaRMSNorm(nn.Module): return self.weight * hidden_states.to(input_dtype) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->OpenLlama +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->OpenLlama class OpenLlamaRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -154,7 +154,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index b9fbf8d70b..75346601d7 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -81,7 +81,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. 
@@ -123,7 +123,7 @@ def _get_unpad_data(attention_mask): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Falcon +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Falcon class FalconRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 200f91f750..6003c91cee 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -253,7 +253,6 @@ class GemmaAttention(nn.Module): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -265,8 +264,8 @@ class GemmaAttention(nn.Module): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, None) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: # sin and cos are specific to RoPE models; cache_position needed for the static cache diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index e0b2309fc9..4980f7c636 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -522,7 +522,7 @@ def attention_mask_func(attention_scores, ltor_mask): class GPTNeoXRotaryEmbedding(nn.Module): - # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding.__init__ + # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding.__init__ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -614,7 +614,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. 
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index ea934581aa..24d2113178 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -230,7 +230,7 @@ class GPTNeoXJapaneseAttention(nn.Module): # Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoXRotaryEmbedding->RotaryEmbedding class RotaryEmbedding(nn.Module): - # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding.__init__ + # Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding.__init__ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 68ec6ab8a5..2ed51413d9 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -478,7 +478,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 5d8a3f987a..1f4f6ac9a0 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -303,7 +303,6 @@ class LlamaAttention(nn.Module): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -402,7 +401,6 @@ class LlamaFlashAttention2(LlamaAttention): output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index a524c5958e..c54b8774ee 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Mistral model.""" +"""PyTorch Mistral model.""" + import inspect import math from typing import List, Optional, Tuple, Union @@ -29,8 +30,8 @@ from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache -from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from ...cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -92,8 +93,6 @@ class MistralRMSNorm(nn.Module): return self.weight * hidden_states.to(input_dtype) -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache class MistralRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -104,30 +103,22 @@ class MistralRotaryEmbedding(nn.Module): inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq) - - freqs = torch.outer(t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): + @torch.no_grad() + # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) # Copied from transformers.models.llama.modeling_llama.rotate_half @@ -138,9 +129,8 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -# TODO @Arthur no longer copied from LLama after static cache -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): +# Copied from 
transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -148,9 +138,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. unsqueeze_dim (`int`, *optional*, defaults to 1): The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note @@ -161,8 +150,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -213,6 +202,7 @@ class MistralAttention(nn.Module): "when creating this class." ) + self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads @@ -221,7 +211,6 @@ class MistralAttention(nn.Module): self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True - self.attention_dropout = config.attention_dropout if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -231,7 +220,7 @@ class MistralAttention(nn.Module): self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) self.rotary_emb = MistralRotaryEmbedding( self.head_dim, @@ -239,9 +228,7 @@ class MistralAttention(nn.Module): base=self.rope_theta, ) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - + # Copied from transformers.models.gemma.modeling_gemma.GemmaAttention.forward with Gemma->Mistral def forward( self, hidden_states: torch.Tensor, @@ -250,6 +237,7 @@ class MistralAttention(nn.Module): past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -261,41 +249,22 @@ class MistralAttention(nn.Module): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." - ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - - attn_weights = attn_weights + attention_mask + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) @@ -309,8 +278,8 @@ class MistralAttention(nn.Module): ) attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.view(bsz, q_len, -1) attn_output = self.o_proj(attn_output) if not output_attentions: @@ -343,7 +312,16 @@ class MistralFlashAttention2(MistralAttention): past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, ): + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) @@ -356,19 +334,10 @@ class MistralFlashAttention2(MistralAttention): kv_seq_len = key_states.shape[-2] if past_key_value is not None: - if self.layer_idx is None: - raise ValueError( - f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " - "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " - "with a layer index." 
- ) - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + kv_seq_len += cache_position[0] - # Because the input can be padded, the absolute sequence length depends on the max position id. - rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 - cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) use_sliding_windows = ( _flash_supports_window_size @@ -605,8 +574,7 @@ class MistralFlashAttention2(MistralAttention): ) -# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral -# TODO @Arthur no longer copied from LLama after static cache +# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral class MistralSdpaAttention(MistralAttention): """ Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -623,6 +591,7 @@ class MistralSdpaAttention(MistralAttention): past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. @@ -637,6 +606,7 @@ class MistralSdpaAttention(MistralAttention): past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + cache_position=cache_position, ) bsz, q_len, _ = hidden_states.size() @@ -649,43 +619,38 @@ class MistralSdpaAttention(MistralAttention): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) + causal_mask = attention_mask if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, # Reference: https://github.com/pytorch/pytorch/issues/112577. 
- if query_states.device.type == "cuda" and attention_mask is not None: + if query_states.device.type == "cuda" and causal_mask is not None: query_states = query_states.contiguous() key_states = key_states.contiguous() value_states = value_states.contiguous() # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. - is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False + is_causal = True if causal_mask is None and q_len > 1 else False attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, - attn_mask=attention_mask, + attn_mask=causal_mask, dropout_p=self.attention_dropout if self.training else 0.0, is_causal=is_causal, ) @@ -705,12 +670,13 @@ MISTRAL_ATTENTION_CLASSES = { } +# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL class MistralDecoderLayer(nn.Module): def __init__(self, config: MistralConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx) + self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = MistralMLP(config) self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -721,15 +687,17 @@ class MistralDecoderLayer(nn.Module): hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + past_key_value: Optional[Cache] = None, output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): attention mask of size - `(batch, sequence_length)` where padding elements are indicated by 0. + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. 
@@ -751,6 +719,7 @@ class MistralDecoderLayer(nn.Module): past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + cache_position=cache_position, ) hidden_states = residual + hidden_states @@ -801,6 +770,7 @@ class MistralPreTrainedModel(PreTrainedModel): _supports_flash_attn_2 = True _supports_sdpa = True _supports_cache_class = True + _supports_static_cache = True def _init_weights(self, module): std = self.config.initializer_range @@ -924,12 +894,13 @@ class MistralModel(MistralPreTrainedModel): input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -940,74 +911,38 @@ class MistralModel(MistralPreTrainedModel): return_dict = return_dict if return_dict is not None else self.config.use_return_dict # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - past_key_values_length = 0 - - if use_cache: - use_legacy_cache = not isinstance(past_key_values, Cache) - if use_legacy_cache: - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - past_key_values_length = past_key_values.get_usable_length(seq_length) - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" ) - position_ids = position_ids.unsqueeze(0).view(-1, seq_length) - else: - position_ids = position_ids.view(-1, seq_length).long() + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) - if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache: - is_padding_right = attention_mask[:, -1].sum().item() != batch_size - if is_padding_right: - raise ValueError( - "You are attempting to perform batched generation with padding_side='right'" - " this may lead to unexpected behaviour for Flash Attention version of Mistral. 
Make sure to " - " call `tokenizer.padding_side = 'left'` before tokenizing the input. " - ) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + return_legacy_cache = True - if self._attn_implementation == "flash_attention_2": - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - elif self._attn_implementation == "sdpa" and not output_attentions: - # output_attentions=True can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, - ) - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - sliding_window=self.config.sliding_window, + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, use_cache, output_attentions + ) + hidden_states = inputs_embeds # decoder layers @@ -1023,20 +958,22 @@ class MistralModel(MistralPreTrainedModel): layer_outputs = self._gradient_checkpointing_func( decoder_layer.__call__, hidden_states, - attention_mask, + causal_mask, position_ids, past_key_values, output_attentions, use_cache, + cache_position, ) else: layer_outputs = decoder_layer( hidden_states, - attention_mask=attention_mask, + attention_mask=causal_mask, position_ids=position_ids, past_key_value=past_key_values, output_attentions=output_attentions, use_cache=use_cache, + cache_position=cache_position, ) hidden_states = layer_outputs[0] @@ -1053,9 +990,9 @@ class MistralModel(MistralPreTrainedModel): if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = None - if use_cache: - next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() if not return_dict: return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) @@ -1066,6 +1003,113 @@ class MistralModel(MistralPreTrainedModel): attentions=all_self_attns, ) + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + use_cache: bool, + output_attentions: bool, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 + + if self._attn_implementation == "flash_attention_2": + if attention_mask is not None and use_cache: + is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0] + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + + # cache_position must be valid here no matter which cache we use + past_seen_tokens = cache_position[0] if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache) + + if ( + self.config._attn_implementation == "sdpa" + and not (using_static_cache or using_sliding_window_cache) + and not output_attentions + ): + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + sliding_window=self.config.sliding_window, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + # SlidingWindowCache + if using_sliding_window_cache: + target_length = max(sequence_length, self.config.sliding_window) + # StaticCache + elif using_static_cache: + target_length = past_key_values.get_max_length() + # DynamicCache or no cache + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + exclude_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + if self.config.sliding_window is not None: + if not using_sliding_window_cache or sequence_length > self.config.sliding_window: + exclude_mask |= torch.arange(target_length, device=device) <= ( + cache_position.reshape(-1, 1) - self.config.sliding_window + ) + causal_mask *= exclude_mask + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is 
not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + class MistralForCausalLM(MistralPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] @@ -1104,13 +1148,14 @@ class MistralForCausalLM(MistralPreTrainedModel): input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: @@ -1155,6 +1200,7 @@ class MistralForCausalLM(MistralPreTrainedModel): output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) hidden_states = outputs[0] @@ -1187,14 +1233,27 @@ class MistralForCausalLM(MistralPreTrainedModel): ) def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, + **kwargs, ): # Omit tokens covered by past_key_values + past_length = 0 if past_key_values is not None: if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - max_cache_length = past_key_values.get_max_length() + past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length() + max_cache_length = ( + torch.tensor(past_key_values.get_max_length(), device=input_ids.device) + if past_key_values.get_max_length() is not None + else None + ) + cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length) + # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects else: cache_length = past_length = past_key_values[0][0].shape[2] max_cache_length = None @@ -1227,17 +1286,33 @@ class MistralForCausalLM(MistralPreTrainedModel): if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] + # crop the attention_mask to sliding window size during decode phase if using SlidingWindowCache + if ( + past_length > 0 + and attention_mask is not None + and isinstance(past_key_values, SlidingWindowCache) + and attention_mask.shape[1] > past_key_values.sliding_window_size + ): + attention_mask = attention_mask[:, -past_key_values.sliding_window_size :] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: - model_inputs = {"input_ids": input_ids} + model_inputs = {"input_ids": input_ids.contiguous()} + + input_length = position_ids.shape[-1] if position_ids is not None else 
input_ids.shape[-1] + if cache_position is None: + cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device) + elif use_cache: + cache_position = cache_position[-input_length:] model_inputs.update( { "position_ids": position_ids, + "cache_position": cache_position, "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), + "use_cache": use_cache, "attention_mask": attention_mask, } ) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index dfdd114728..70e1746392 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -181,7 +181,8 @@ class MixtralRMSNorm(nn.Module): return self.weight * hidden_states.to(input_dtype) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral +# copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mixtral +# TODO @longjie no longer copied from Mistral after static cache class MixtralRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -226,7 +227,8 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# TODO @longjie no longer copied from Mistral after static cache def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -268,7 +270,8 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -# Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral +# copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral +# TODO @longjie no longer copied from Mistral after static cache class MixtralAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer @@ -392,7 +395,8 @@ class MixtralAttention(nn.Module): return attn_output, attn_weights, past_key_value -# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral +# copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral +# TODO @longjie no longer copied from Mistral after static cache class MixtralFlashAttention2(MixtralAttention): """ Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays @@ -679,7 +683,8 @@ class MixtralFlashAttention2(MixtralAttention): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral +# copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Mixtral +# TODO @longjie no longer copied from Mistral after static cache class MixtralSdpaAttention(MixtralAttention): """ Mixtral attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from @@ -958,7 +963,7 @@ MIXTRAL_START_DOCSTRING = r""" "The bare Mixtral Model outputting raw hidden-states without any specific head on top.", MIXTRAL_START_DOCSTRING, ) -# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral +# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Mixtral class MixtralPreTrainedModel(PreTrainedModel): config_class = MixtralConfig base_model_prefix = "model" @@ -1052,7 +1057,8 @@ MIXTRAL_INPUTS_DOCSTRING = r""" "The bare Mixtral Model outputting raw hidden-states without any specific head on top.", MIXTRAL_START_DOCSTRING, ) -# Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral +# copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral +# TODO @longjie no longer copied from Mistral after static cache class MixtralModel(MixtralPreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`] diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 75ba7163ba..ea2bd074ee 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -45,7 +45,7 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "PersimmonConfig" -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Persimmon +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Persimmon class PersimmonRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -137,7 +137,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 1f82b09a25..1436138f91 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -76,7 +76,7 @@ def _get_unpad_data(attention_mask): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Phi +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Phi class PhiRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -168,7 +168,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. 
diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index f90d61d6e6..98a9e27c9b 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -94,7 +94,7 @@ class Qwen2RMSNorm(nn.Module): return self.weight * hidden_states.to(input_dtype) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2 +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2 class Qwen2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -139,7 +139,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -611,7 +611,7 @@ class Qwen2FlashAttention2(Qwen2Attention): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2 +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Qwen2 class Qwen2SdpaAttention(Qwen2Attention): """ Qwen2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 21d8d608ee..0e4b4b75e8 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -170,7 +170,7 @@ class Qwen2MoeRMSNorm(nn.Module): return self.weight * hidden_states.to(input_dtype) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2Moe +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Qwen2Moe class Qwen2MoeRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -215,7 +215,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -689,7 +689,7 @@ class Qwen2MoeFlashAttention2(Qwen2MoeAttention): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Qwen2Moe +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Qwen2Moe class Qwen2MoeSdpaAttention(Qwen2MoeAttention): """ Qwen2Moe attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index e8d07340d3..5b81abc693 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -71,7 +71,7 @@ def _get_unpad_data(attention_mask): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->StableLm +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->StableLm class StableLmRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -163,7 +163,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index d808a3fd25..e0abef4b41 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -74,7 +74,7 @@ def _get_unpad_data(attention_mask): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Starcoder2 +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralRotaryEmbedding with Mixtral->Starcoder2 class Starcoder2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() @@ -119,7 +119,7 @@ def rotate_half(x): return torch.cat((-x2, x1), dim=-1) -# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb +# Copied from transformers.models.mixtral.modeling_mixtral.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -590,7 +590,7 @@ class Starcoder2FlashAttention2(Starcoder2Attention): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralSdpaAttention with Mistral->Starcoder2 +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSdpaAttention with Mixtral->Starcoder2 class Starcoder2SdpaAttention(Starcoder2Attention): """ Starcoder2 attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from @@ -703,7 +703,7 @@ class Starcoder2DecoderLayer(nn.Module): self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) - # Copied from transformers.models.mistral.modeling_mistral.MistralDecoderLayer.forward + # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer.forward def forward( self, hidden_states: torch.Tensor, @@ -780,7 +780,7 @@ STARCODER2_START_DOCSTRING = r""" "The bare Starcoder2 Model outputting raw hidden-states without any specific head on top.", STARCODER2_START_DOCSTRING, ) -# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Starcoder2 +# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2PreTrainedModel with Qwen2->Starcoder2 class Starcoder2PreTrainedModel(PreTrainedModel): config_class = Starcoder2Config base_model_prefix = "model" @@ -1057,7 +1057,7 @@ class Starcoder2Model(Starcoder2PreTrainedModel): ) -# Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM with MISTRAL->STARCODER2,Mistral-7B-v0.1->starcoder2-7b_16k,Mistral->Starcoder2,mistralai->bigcode +# Copied from transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM with QWEN2->STARCODER2,Qwen2->Starcoder2 class Starcoder2ForCausalLM(Starcoder2PreTrainedModel): _tied_weights_keys = ["lm_head.weight"] @@ -1090,6 +1090,7 @@ class Starcoder2ForCausalLM(Starcoder2PreTrainedModel): @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + # Ignore copy def forward( self, input_ids: torch.LongTensor = None, diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 10dcb3bf4d..13bdf83c5b 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -12,14 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Mistral model. """ - +"""Testing suite for the PyTorch Mistral model.""" import gc import tempfile import unittest import pytest +from packaging import version from transformers import AutoTokenizer, MistralConfig, is_torch_available, set_seed from transformers.testing_utils import ( @@ -648,6 +648,74 @@ class MistralIntegrationTest(unittest.TestCase): backend_empty_cache(torch_device) gc.collect() + @slow + def test_compile_static_cache(self): + # `torch==2.2` will throw an error on this test (as in other compilation tests), but torch==2.1.2 and torch>2.2 + # work as intended. See https://github.com/pytorch/pytorch/issues/121943 + if version.parse(torch.__version__) < version.parse("2.3.0"): + self.skipTest("This test requires torch >= 2.3 to run.") + + NUM_TOKENS_TO_GENERATE = 40 + EXPECTED_TEXT_COMPLETION = { + 8: [ + "My favourite condiment is 100% ketchup. I love it on everything. " + "I’m not a big fan of mustard, mayo, or relish. I’m not a fan of pickles" + ], + 7: [ + "My favourite condiment is 100% ketchup. I love it on everything. " + "I’m not a big fan of mustard, mayo, or relish. 
I’m not a fan of pickles" + ], + } + + prompts = ["My favourite condiment is "] + tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) + tokenizer.pad_token = tokenizer.eos_token + model = MistralForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", device_map="sequential", torch_dtype=torch.float16 + ) + inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) + + # Dynamic Cache + generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) + dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], dynamic_text) + + # Static Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text) + + # Sliding Window Cache + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="sliding_window" + ) + static_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_text) + + # Static Cache + compile + forward_function = model.forward + model.forward = torch.compile(forward_function, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="static" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text) + + # Sliding Window Cache + compile + torch._dynamo.reset() + model.forward = torch.compile(forward_function, mode="reduce-overhead", fullgraph=True) + generated_ids = model.generate( + **inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False, cache_implementation="sliding_window" + ) + static_compiled_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], static_compiled_text) + + del model + backend_empty_cache(torch_device) + gc.collect() + @slow @require_torch_gpu From 0df888ffb72ea370555efdef45985378d3cc7b2b Mon Sep 17 00:00:00 2001 From: Aaron Jimenez Date: Mon, 20 May 2024 15:48:52 -0800 Subject: [PATCH 15/80] [docs] Spanish translation of model_memory_anatomy.md (#30885) * add model_memory_anatomy to es/_toctree.yml * copy model_memory_anatomy.md to es/ * translate first section * translate doc * chage forward activations * fix sentence and and link to Trainer * fix Trainer link --- docs/source/es/_toctree.yml | 2 + docs/source/es/model_memory_anatomy.md | 239 +++++++++++++++++++++++++ 2 files changed, 241 insertions(+) create mode 100644 docs/source/es/model_memory_anatomy.md diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml index cf1ae39c03..5a20aca2e5 100644 --- a/docs/source/es/_toctree.yml +++ b/docs/source/es/_toctree.yml @@ -102,4 +102,6 @@ title: Perplejidad de los modelos de longitud fija - local: pipeline_webserver title: Flujo de trabajo para la inferencia de los servidores web + - local: model_memory_anatomy + title: Anatomía del entrenamiento de los modelos title: 
Guías conceptuales diff --git a/docs/source/es/model_memory_anatomy.md b/docs/source/es/model_memory_anatomy.md new file mode 100644 index 0000000000..6a220da993 --- /dev/null +++ b/docs/source/es/model_memory_anatomy.md @@ -0,0 +1,239 @@ + + +# Anatomía del entrenamiento de los modelos + +Para entender las técnicas de optimización del rendimiento que se pueden aplicar para mejorar la eficiencia en la velocidad del entrenamiento de los modelos y la utilización de la memoria, es útil familiarizarse con cómo se utiliza la GPU durante el entrenamiento y cómo varía la intensidad de cálculo según la operación realizada. + +Empecemos explorando un ejemplo enfocado en la utilización de la GPU y la ejecución del entrenamiento de un modelo. Para la demostración, necesitaremos instalar algunas bibliotecas: + +```bash +pip install transformers datasets accelerate nvidia-ml-py3 +``` + +La biblioteca `nvidia-ml-py3` nos permite monitorear la utilización de memoria de los modelos desde Python. Es posible que estés familiarizado con el comando `nvidia-smi` en la terminal, esta biblioteca nos permite acceder a la misma información en Python directamente. + +Luego, creamos algunos datos ficticios: IDs de tokens aleatorios entre 100 y 30000 y etiquetas binarias para un clasificador. En total, obtenemos 512 secuencias cada una con longitud 512 y las almacenamos en un [`~datasets.Dataset`] con formato PyTorch. + + +```py +>>> import numpy as np +>>> from datasets import Dataset + + +>>> seq_len, dataset_size = 512, 512 +>>> dummy_data = { +... "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), +... "labels": np.random.randint(0, 1, (dataset_size)), +... } +>>> ds = Dataset.from_dict(dummy_data) +>>> ds.set_format("pt") +``` + +Para imprimir estadísticas resumidas para la utilización de la GPU y la ejecución del entrenamiento con [`Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Trainer), definimos dos funciones auxiliares: + +```py +>>> from pynvml import * + + +>>> def print_gpu_utilization(): +... nvmlInit() +... handle = nvmlDeviceGetHandleByIndex(0) +... info = nvmlDeviceGetMemoryInfo(handle) +... print(f"GPU memory occupied: {info.used//1024**2} MB.") + + +>>> def print_summary(result): +... print(f"Time: {result.metrics['train_runtime']:.2f}") +... print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") +... print_gpu_utilization() +``` + +Comencemos comprobando que la memoria GPU este libre: + +```py +>>> print_gpu_utilization() +GPU memory occupied: 0 MB. +``` + +Parece estar bien: la memoria de la GPU no está ocupada como esperaríamos antes de cargar cualquier modelo. Si no es el caso en tu máquina, asegúrate de detener todos los procesos que estén utilizando la memoria de la GPU. Sin embargo, no toda la memoria libre de la GPU puede ser utilizada por el usuario. Cuando se carga un modelo en la GPU, también se cargan los kernels, lo que puede ocupar 1-2GB de memoria. Para ver cuánta memoria será ocupada por defecto, cargemos un tensor diminuto en la GPU, lo que también desencadena la carga de los kernels. + +```py +>>> import torch + + +>>> torch.ones((1, 1)).to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 1343 MB. +``` + +Vemos que los kernels solos ocupan 1,3GB de memoria de la GPU. Ahora, veamos cuánto espacio ocupa el modelo. + +## Cargar el Modelo + +Primero, cargamos el modelo `google-bert/bert-large-uncased`. 
Los pesos del modelo son cargados directamente en la GPU para que podamos verificar cuánto espacio ocupan solo los pesos. + +```py +>>> from transformers import AutoModelForSequenceClassification + + +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda") +>>> print_gpu_utilization() +GPU memory occupied: 2631 MB. +``` + +Podemos ver que los pesos del modelo solos ocupan 1,3 GB de memoria de la GPU. El número exacto depende de la GPU específica que estés utilizando. Ten en cuenta que en GPUs más modernas, un modelo puede ocupar más espacio ya que los pesos se cargan de manera optimizada lo cual acelera el uso del modelo. Ahora también podemos verificar rápidamente si obtenemos el mismo resultado que con la CLI de `nvidia-smi`: + +```bash +nvidia-smi +``` + +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 Tesla V100-SXM2... On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C ...nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ +``` + +Obtenemos el mismo número que antes y también puedes ver que estamos utilizando una GPU V100 con 16GB de memoria. Ahora podemos empezar a entrenar el modelo y ver cómo cambia el consumo de memoria de la GPU. Primero, configuramos algunos argumentos de entrenamiento estándar: + +```py +default_args = { + "output_dir": "tmp", + "eval_strategy": "steps", + "num_train_epochs": 1, + "log_level": "error", + "report_to": "none", +} +``` + + + +Si planeas ejecutar varias pruebas, reinicie el kernel de Python entre cada prueba para borrar correctamente la memoria. + + + +## Utilización de la memoria en el entrenamiento + +Vamos a utilizar el [`Trainer`](https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.Trainer) y entrenar el modelo sin utilizar ninguna técnica de optimización del rendimiento de la GPU y un tamaño de lote de 4: + +```py +>>> from transformers import TrainingArguments, Trainer, logging + +>>> logging.set_verbosity_error() + + +>>> training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +>>> trainer = Trainer(model=model, args=training_args, train_dataset=ds) +>>> result = trainer.train() +>>> print_summary(result) +``` + +``` +Time: 57.82 +Samples/second: 8.86 +GPU memory occupied: 14949 MB. +``` + +Vemos que incluso un tamaño de lote relativamente pequeño casi llena toda la memoria de nuestra GPU. Sin embargo, un tamaño de lote más grande a menudo puede resultar en una convergencia del modelo más rápida o un mejor rendimiento final. 
Así que idealmente queremos ajustar el tamaño del lote a las necesidades del modelo y no a las limitaciones de la GPU. Lo interesante es que utilizamos mucha más memoria que el tamaño del modelo. +Para entender un poco mejor por qué es el caso, echemos un vistazo a las operaciones y necesidades de memoria de un modelo. + +## Anatomía de las Operaciones del Modelo + +La arquitectura de los transformers incluye 3 grupos principales de operaciones agrupadas a continuación por intensidad de cálculo. + +1. **Contracciones de Tensores** + + Las capas lineales y componentes de la Atención Multi-Head realizan **multiplicaciones matriciales por lotes**. Estas operaciones son la parte más intensiva en cálculo del entrenamiento de los transformers. + +2. **Normalizaciones Estadísticas** + + Softmax y normalización de capas son menos intensivas en cálculo que las contracciones de tensores, e implican una o más **operaciones de reducción**, cuyo resultado se aplica luego mediante un mapa. + +3. **Operadores por Elemento** + + Estos son los operadores restantes: **sesgos, dropout, activaciones y conexiones residuales**. Estas son las operaciones menos intensivas en cálculo. + +Este conocimiento puede ser útil al analizar cuellos de botella de rendimiento. + +Este resumen se deriva de [Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020](https://arxiv.org/abs/2007.00072) + + +## Anatomía de la Memoria del Modelo + +Hemos visto que al entrenar un modelo se utiliza mucha más memoria que solo poner el modelo en la GPU. Esto se debe a que hay muchos componentes durante el entrenamiento que utilizan memoria de la GPU. Los componentes en memoria de la GPU son los siguientes: + +1. pesos del modelo +2. estados del optimizador +3. gradientes +4. activaciones hacia adelante guardadas para el cálculo del gradiente +5. buffers temporales +6. memoria específica de funcionalidad + +Un modelo típico entrenado en precisión mixta con AdamW requiere 18 bytes por parámetro del modelo más memoria de activación. Para la inferencia no hay estados del optimizador ni gradientes, por lo que podemos restarlos. Y así terminamos con 6 bytes por parámetro del modelo para la inferencia en precisión mixta, más la memoria de activación. + +Veámoslo a detalle: + +**Pesos del Modelo:** + +- 4 bytes por número de parámetros para entrenamiento en fp32 +- 6 bytes por número de parámetros para entrenamiento en precisión mixta (mantiene un modelo en fp32 y uno en fp16 en memoria) + +**Estados del Optimizador:** + +- 8 bytes por número de parámetros para un AdamW normal (mantiene 2 estados) +- 2 bytes por número de parámetros para optimizadores de 8 bits como [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) +- 4 bytes por número de parámetros para optimizadores como SGD con momentum (mantiene solo 1 estado) + +**Gradientes** + +- 4 bytes por número de parámetros para entrenamiento en fp32 o precisión mixta (los gradientes siempre se mantienen en fp32) + +**Activaciones hacia Adelante** + +- El tamaño depende de muchos factores, los principales siendo la longitud de la secuencia, el tamaño oculto y el tamaño de lote. + +Hay entradas y salidas que se pasan y se devuelven por las funciones hacia adelante y hacia atrás, y las activaciones hacia adelante (*forward activations*) guardadas para el cálculo del gradiente. 
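+
+Como ilustración aproximada del desglose anterior (ejemplo añadido solo a modo de referencia: la función `estimar_memoria_entrenamiento_gb` es ilustrativa, reutiliza el `model` cargado más arriba y no incluye las activaciones ni los buffers temporales):
+
+```py
+>>> def estimar_memoria_entrenamiento_gb(num_parametros):
+...     # Precisión mixta con AdamW: 6 (pesos) + 8 (optimizador) + 4 (gradientes) = 18 bytes por parámetro
+...     return 18 * num_parametros / 1024**3
+
+
+>>> num_parametros = sum(p.numel() for p in model.parameters())
+>>> print(f"Memoria de entrenamiento estimada (sin activaciones): {estimar_memoria_entrenamiento_gb(num_parametros):.1f} GB")
+```
+
+Para la inferencia en precisión mixta bastaría con sustituir los 18 bytes por los 6 bytes por parámetro mencionados antes.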
+ +**Memoria Temporal** + +Además, hay todas clases de variables temporales que se liberan una vez que se completa el cálculo, pero en el momento podrían requerir memoria adicional y podrían provocar un error de memoria insuficiente. Por lo tanto, al codificar es crucial pensar estratégicamente sobre tales variables temporales y a veces liberarlas explícitamente tan pronto como ya no se necesitan. + +**Memoria Específica de Funcionalidad** + +Entonces, su software podría tener necesidades especiales de memoria. Por ejemplo, al generar texto mediante la búsqueda por haz, el software necesita mantener múltiples copias de las entradas y salidas. + +**Velocidad de Ejecución `forward` vs `backward`** + +Para convoluciones y capas lineales, hay 2x flops en la ejecución hacia atrás (`backward`) en comparación con la ejecución hacia adelante (`forward`), lo que generalmente se traduce en ~2x más lento (a veces más, porque los tamaños en la ejecución hacia atrás tienden a ser más complejos). Las activaciones suelen ser limitadas por ancho de banda, y es típico que una activación tenga que leer más datos en la ejecución hacia atrás que en la ejecución hacia adelante (por ejemplo, la activación hacia adelante lee una vez, escribe una vez, la activación hacia atrás lee dos veces, gradOutput y salida de la ejecución hacia adelante, y escribe una vez, gradInput). + +Como puedes ver, hay potencialmente unos pocos lugares donde podríamos ahorrar memoria de la GPU o acelerar operaciones. Ahora que entiendes qué afecta la utilización de la GPU y la velocidad de cálculo, consulta la página de documentación [Métodos y herramientas para entrenamiento eficiente en una sola GPU](https://huggingface.co/docs/transformers/perf_train_gpu_one) para aprender sobre técnicas de optimización del rendimiento. From c876d12127272ce2886ce51399928c3343d475b9 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 21 May 2024 09:14:14 +0200 Subject: [PATCH 16/80] FIX / TST: Fix expected results on Mistral slow test (A10) (#30909) Update test_modeling_mistral.py --- tests/models/mistral/test_modeling_mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 13bdf83c5b..dfa0646f8f 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -627,7 +627,7 @@ class MistralIntegrationTest(unittest.TestCase): def test_speculative_generation(self): EXPECTED_TEXT_COMPLETION = { 7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs", - 8: "My favourite condiment is 100% Sriracha. I love the heat, the sweetness, the tang", + 8: "My favourite condiment is 100% ketchup. I love it on everything. 
I’m not a big", } prompt = "My favourite condiment is " tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) From 8871b261500b3b81b08f58972edbfba8dd78e58c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 21 May 2024 10:16:37 +0200 Subject: [PATCH 17/80] FEAT / Trainer: LOMO optimizer support (#30178) * add V1 - adalomo not working yet * add todo docs + refactor from comments * adjust LR * add docs * add more elaborated test * Apply suggestions from code review Co-authored-by: Zach Mueller * fix * push * add accelerate check * fix DDP case * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * fix * init kwargs * safely add attribute * revert to enum logic * Update src/transformers/trainer.py --------- Co-authored-by: Zach Mueller Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/trainer.md | 50 ++++++++++++++++++++++++++ src/transformers/testing_utils.py | 9 +++++ src/transformers/trainer.py | 42 +++++++++++++++++++--- src/transformers/training_args.py | 2 ++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 +++ tests/trainer/test_trainer.py | 44 +++++++++++++++++++++++ 7 files changed, 149 insertions(+), 4 deletions(-) diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index b69bebd6ea..b71f42aa14 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -382,6 +382,56 @@ trainer.train() Note layerwise optimization is a bit experimental and does not support DDP (Distributed Data Parallel), thus you can run the training script only on a single GPU. Please see [this appropriate section](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) for more details. Other features such as gradient clipping, DeepSpeed, etc might not be supported out of the box. Please [raise an issue on GitHub](https://github.com/huggingface/transformers/issues) if you encounter such issue. +## LOMO optimizer + +The LOMO optimizers have been introduced in [Full Parameter Fine-Tuning for Large Language Models with Limited Resources](https://hf.co/papers/2306.09782) and [AdaLomo: Low-memory Optimization with Adaptive Learning Rate](https://hf.co/papers/2310.10195). +They both consist of an efficient full-parameter fine-tuning method. These optimizers fuse the gradient computation and the parameter update in one step to reduce memory usage. Supported optimizers for LOMO are `"lomo"` and `"adalomo"`. First either install LOMO from pypi `pip install lomo-optim` or install it from source with `pip install git+https://github.com/OpenLMLab/LOMO.git`. + + + +According to the authors, it is recommended to use `AdaLomo` without `grad_norm` to get better performance and higher throughput. 
+ + + +Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on IMDB dataset in full precision: + +```python +import torch +import datasets +from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM +import trl + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-lomo", + max_steps=1000, + per_device_train_batch_size=4, + optim="adalomo", + gradient_checkpointing=True, + logging_strategy="steps", + logging_steps=1, + learning_rate=2e-6, + save_strategy="no", + run_name="lomo-imdb", +) + +model_id = "google/gemma-2b" + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=1024, +) + +trainer.train() +``` + ## Accelerate and Trainer The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/). diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index e17c5d295f..0abd804a36 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -82,6 +82,7 @@ from .utils import ( is_keras_nlp_available, is_levenshtein_available, is_librosa_available, + is_lomo_available, is_natten_available, is_nltk_available, is_onnx_available, @@ -338,6 +339,14 @@ def require_galore_torch(test_case): return unittest.skipUnless(is_galore_torch_available(), "test requires GaLore")(test_case) +def require_lomo(test_case): + """ + Decorator marking a test that requires LOMO. These tests are skipped when LOMO-optim isn't installed. + https://github.com/OpenLMLab/LOMO + """ + return unittest.skipUnless(is_lomo_available(), "test requires LOMO")(test_case) + + def require_cv2(test_case): """ Decorator marking a test that requires OpenCV. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index afe414fbc8..20c709b248 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -153,6 +153,7 @@ from .utils import ( is_galore_torch_available, is_in_notebook, is_ipex_available, + is_lomo_available, is_peft_available, is_safetensors_available, is_sagemaker_dp_enabled, @@ -1059,12 +1060,18 @@ class Trainer: if "params" in optimizer_kwargs: optimizer_grouped_parameters = optimizer_kwargs.pop("params") + # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs` + # e.g. for LOMO optimizer. + if "model" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("model") + # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict` # to avoid arguments conflicts. 
if "optimizer_dict" in optimizer_kwargs: optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict") self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": import bitsandbytes @@ -1382,6 +1389,26 @@ class Trainer: if args.optim == OptimizerNames.GALORE_ADAFACTOR: optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) + elif args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + if not is_lomo_available(): + raise ImportError( + "You need to install `lomo_optim` in order to use LOMO optimizers" + " install it with `pip install lomo-optim`" + ) + if not is_accelerate_available("0.30.0"): + raise ImportError("You need to have `accelerate>=0.30.0` to be able to use LOMO optimizers") + + if model is None: + raise ValueError("You need to pass a `model` in order to correctly initialize a LOMO optimizer.") + + from lomo_optim import AdaLomo, Lomo + + if "ada" in args.optim: + optimizer_cls = AdaLomo + else: + optimizer_cls = Lomo + + optimizer_kwargs.update({"model": model}) else: raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}") return optimizer_cls, optimizer_kwargs @@ -2045,6 +2072,9 @@ class Trainer: model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( self.model, self.optimizer, self.lr_scheduler ) + elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + # In this case we are in DDP + LOMO, which should be supported + self.optimizer = self.accelerator.prepare(self.optimizer) if self.is_fsdp_enabled: self.model = self.model_wrapped = model @@ -2143,7 +2173,6 @@ class Trainer: self._globalstep_last_logged = self.state.global_step model.zero_grad() grad_norm: Optional[float] = None - self.control = self.callback_handler.on_train_begin(args, self.state, self.control) total_batched_samples = 0 @@ -2275,8 +2304,8 @@ class Trainer: else: grad_norm = _grad_norm - # Optimizer step self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped if optimizer_was_run: # Delay optimizer scheduling until metrics are generated @@ -3229,7 +3258,6 @@ class Trainer: """ model.train() inputs = self._prepare_inputs(inputs) - if is_sagemaker_mp_enabled(): loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) return loss_mb.reduce_mean().detach().to(self.args.device) @@ -3240,6 +3268,12 @@ class Trainer: del inputs torch.cuda.empty_cache() + kwargs = {} + + # For LOMO optimizers you need to explicitly use the learnign rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + if self.args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training @@ -3247,7 +3281,7 @@ class Trainer: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: - self.accelerator.backward(loss) + self.accelerator.backward(loss, **kwargs) return loss.detach() / self.args.gradient_accumulation_steps diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 08d9000cb6..24bb3c7d02 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -171,6 +171,8 @@ class OptimizerNames(ExplicitEnum): GALORE_ADAMW_LAYERWISE = "galore_adamw_layerwise" GALORE_ADAMW_8BIT_LAYERWISE = "galore_adamw_8bit_layerwise" GALORE_ADAFACTOR_LAYERWISE = "galore_adafactor_layerwise" + LOMO = "lomo" + ADALOMO = "adalomo" # Sometimes users will pass in a 
`str` repr of a dict in the CLI diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 82d0b5001c..9f9d5ceb8d 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -141,6 +141,7 @@ from .import_utils import ( is_keras_nlp_available, is_levenshtein_available, is_librosa_available, + is_lomo_available, is_mlx_available, is_natten_available, is_ninja_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index b941080b97..e58754db0d 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -99,6 +99,7 @@ _av_available = importlib.util.find_spec("av") is not None _bitsandbytes_available = _is_package_available("bitsandbytes") _eetq_available = _is_package_available("eetq") _galore_torch_available = _is_package_available("galore_torch") +_lomo_available = _is_package_available("lomo_optim") # `importlib.metadata.version` doesn't work with `bs4` but `beautifulsoup4`. For `importlib.util.find_spec`, reversed. _bs4_available = importlib.util.find_spec("bs4") is not None _coloredlogs_available = _is_package_available("coloredlogs") @@ -328,6 +329,10 @@ def is_galore_torch_available(): return _galore_torch_available +def is_lomo_available(): + return _lomo_available + + def is_pyctcdecode_available(): return _pyctcdecode_available diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 1711f600ce..2d76264341 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -63,6 +63,7 @@ from transformers.testing_utils import ( require_deepspeed, require_galore_torch, require_intel_extension_for_pytorch, + require_lomo, require_optuna, require_peft, require_ray, @@ -1229,6 +1230,49 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): trainer.train() trainer.evaluate() + @require_lomo + @require_torch_gpu + def test_lomo(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + + previous_params = {n: p.clone() for n, p in tiny_llama.named_parameters()} + + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmpdir: + # Trainer without inf/nan filter + args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + + for name, param in tiny_llama.named_parameters(): + self.assertFalse(torch.allclose(param, previous_params[name].to(param.device), rtol=1e-12, atol=1e-12)) + + @require_lomo + @require_torch_gpu + def test_adalomo(self): + config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4) + tiny_llama = LlamaForCausalLM(config) + x = torch.randint(0, 100, (128,)) + train_dataset = RepeatDataset(x) + + with tempfile.TemporaryDirectory() as tmpdir: + # Trainer without inf/nan filter + args = TrainingArguments( + tmpdir, + learning_rate=1e-9, + logging_steps=5, + optim="adalomo", + ) + trainer = Trainer(tiny_llama, args, train_dataset=train_dataset) + + # Check this works + _ = trainer.train() + def test_galore_matched_modules(self): regex_patterns = [r".*.attn.*", r".*.mlp.*"] From d502bd64756535ff6af43cbc5a15aa5da7f52483 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 May 2024 
09:41:29 +0100 Subject: [PATCH 18/80] Bump requests from 2.31.0 to 2.32.0 in /examples/research_projects/decision_transformer (#30925) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../research_projects/decision_transformer/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index fc23114851..0ae3469949 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -177,7 +177,7 @@ PyYAML==6.0 ray>2.6.3 redis==4.5.4 regex==2022.3.15 -requests==2.31.0 +requests==2.32.0 requests-oauthlib==1.3.1 resampy==0.2.2 responses==0.18.0 From a755745546779ae5c42510bc02a859bdac82b3b7 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 21 May 2024 17:43:22 +0800 Subject: [PATCH 19/80] PaliGemma - fix processor with no input text (#30916) Update processing_paligemma.py --- src/transformers/models/paligemma/processing_paligemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 76a9aaf705..258954d856 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -177,6 +177,7 @@ class PaliGemmaProcessor(ProcessorMixin): logger.warning_once( "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model." ) + text = "" if isinstance(text, List) and isinstance(images, List): if len(images) < len(text): From 7a4792e6b3feba1cb379478449483754a70257ec Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 21 May 2024 17:16:07 +0530 Subject: [PATCH 20/80] CI: AMD MI300 tests fix (#30797) * add fix * update import * updated dicts and comments * remove prints * Update testing_utils.py --- src/transformers/testing_utils.py | 9 ++++ tests/models/bark/test_modeling_bark.py | 7 ++- tests/models/gemma/test_modeling_gemma.py | 45 +++++++++++++++++++ tests/models/kosmos2/test_modeling_kosmos2.py | 8 ++-- tests/models/llama/test_modeling_llama.py | 17 ++++++- tests/models/mistral/test_modeling_mistral.py | 11 ++++- tests/models/mixtral/test_modeling_mixtral.py | 21 +++++++++ .../perceiver/test_modeling_perceiver.py | 12 ++++- tests/trainer/test_trainer.py | 5 ++- tests/trainer/test_trainer_seq2seq.py | 1 - 10 files changed, 126 insertions(+), 10 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 0abd804a36..4c6940564e 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -166,6 +166,15 @@ ENDPOINT_STAGING = "https://hub-ci.huggingface.co" # Not critical, only usable on the sandboxed CI instance. 
TOKEN = "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL" +if is_torch_available(): + import torch + + IS_ROCM_SYSTEM = torch.version.hip is not None + IS_CUDA_SYSTEM = torch.version.cuda is not None +else: + IS_ROCM_SYSTEM = False + IS_CUDA_SYSTEM = False + def parse_flag_from_env(key, default=False): try: diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index 476031068f..bfc6a4dadf 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -1327,4 +1327,9 @@ class BarkModelIntegrationTests(unittest.TestCase): output_with_offload = self.model.generate(**input_ids, do_sample=False, temperature=1.0) # checks if same output - self.assertListEqual(output_with_no_offload.tolist(), output_with_offload.tolist()) + self.assertListAlmostEqual(output_with_no_offload.squeeze().tolist(), output_with_offload.squeeze().tolist()) + + def assertListAlmostEqual(self, list1, list2, tol=1e-6): + self.assertEqual(len(list1), len(list2)) + for a, b in zip(list1, list2): + self.assertAlmostEqual(a, b, delta=tol) diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 8281f59993..fa0db9ee78 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -601,6 +601,11 @@ class GemmaIntegrationTest(unittest.TestCase): @require_read_token def test_model_2b_bf16(self): model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXTS = { 7: [ "Hello I am doing a project on the 1990s and I need to know what the most popular music", @@ -610,6 +615,10 @@ class GemmaIntegrationTest(unittest.TestCase): "Hello I am doing a project on the 1990s and I need to know what the most popular music", "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], } model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( @@ -627,6 +636,11 @@ class GemmaIntegrationTest(unittest.TestCase): @require_read_token def test_model_2b_eager(self): model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. 
EXPECTED_TEXTS = { 7: [ "Hello I am doing a project on the 1990s and I am looking for some information on the ", @@ -636,6 +650,10 @@ class GemmaIntegrationTest(unittest.TestCase): "Hello I am doing a project on the 1990s and I need to know what the most popular music", "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], } model = AutoModelForCausalLM.from_pretrained( @@ -655,6 +673,11 @@ class GemmaIntegrationTest(unittest.TestCase): @require_read_token def test_model_2b_sdpa(self): model_id = "google/gemma-2b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXTS = { 7: [ "Hello I am doing a project on the 1990s and I need to know what the most popular music", @@ -664,6 +687,10 @@ class GemmaIntegrationTest(unittest.TestCase): "Hello I am doing a project on the 1990s and I need to know what the most popular music", "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], } model = AutoModelForCausalLM.from_pretrained( @@ -763,6 +790,11 @@ class GemmaIntegrationTest(unittest.TestCase): @require_read_token def test_model_7b_bf16(self): model_id = "google/gemma-7b" + + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXTS = { 7: [ """Hello I am doing a project on a 1991 240sx and I am trying to find""", @@ -772,6 +804,10 @@ class GemmaIntegrationTest(unittest.TestCase): "Hello I am doing a project for my school and I am trying to make a program that will read a .txt file", "Hi today I am going to show you how to make a very simple and easy to make a very simple and", ], + 9: [ + "Hello I am doing a project for my school and I am trying to get a servo to move a certain amount of degrees", + "Hi today I am going to show you how to make a very simple and easy to make DIY light up sign", + ], } model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( @@ -845,6 +881,11 @@ class GemmaIntegrationTest(unittest.TestCase): NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + # + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXT_COMPLETION = { 8: [ "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. 
I have looked on the internet and I have found", @@ -854,6 +895,10 @@ class GemmaIntegrationTest(unittest.TestCase): "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", ], + 9: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music was in the 1990s. I have looked on the internet and I have found", + "Hi today\nI have a problem with my 2007 1.9 tdi 105bhp.\nI have a problem with the engine management light on.\nI have checked the", + ], } prompts = ["Hello I am doing", "Hi today"] diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 9bc95b8bd4..ca944d0df0 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -26,7 +26,7 @@ import requests from transformers import AutoModelForVision2Seq, AutoProcessor, Kosmos2Config from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -562,6 +562,8 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): processed_text = processed_text[0] final_text, entities = final_text_with_entities[0] + atol = 1e-4 if IS_ROCM_SYSTEM else 1e-5 + np.testing.assert_allclose( torch.concat(scores[1:4])[:3, :3].to("cpu").numpy(), np.array( @@ -571,7 +573,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): [-0.9352350831031799, -4.688288688659668, 6.240612983703613], ] ), - atol=1e-5, + atol=atol, ) np.testing.assert_allclose( torch.concat(scores[-3:])[-3:, -3:].to("cpu").numpy(), @@ -629,7 +631,7 @@ class Kosmos2ModelIntegrationTest(unittest.TestCase): [-0.7624598741531372, -4.771658897399902, 6.576295852661133], ] ), - atol=1e-5, + atol=atol, ) np.testing.assert_allclose( torch.concat(scores[-3:])[-3:, -3:].to("cpu").numpy(), diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 58269d62e0..61b33b3ec9 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -715,6 +715,11 @@ class LlamaIntegrationTest(unittest.TestCase): NUM_TOKENS_TO_GENERATE = 40 # Note on `EXPECTED_TEXT_COMPLETION`'s diff: the current value matches the original test if the original test # was changed to have a cache of 53 tokens (as opposed to 4096), on Ampere GPUs. + # + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXT_COMPLETION = { 8: [ "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial " @@ -730,7 +735,15 @@ class LlamaIntegrationTest(unittest.TestCase): "My favorite all time favorite condiment is ketchup. I love it on hamburgers, hot dogs, fries, eggs, " "and even on a good old fashioned cheeseburger. I love it on everything. 
I love it so", ], + 9: [ + "Simply put, the theory of relativity states that 1) the speed of light is constant in all inertial" + " reference frames, and 2) the laws of physics are the same for all inertial reference frames.\nThe " + "theory of relativ", + "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs," + " my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ], } + expected_text_completion_idx = 8 prompts = [ "Simply put, the theory of relativity states that ", @@ -745,7 +758,9 @@ class LlamaIntegrationTest(unittest.TestCase): # Dynamic Cache generated_ids = model.generate(**inputs, max_new_tokens=NUM_TOKENS_TO_GENERATE, do_sample=False) dynamic_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION[8], dynamic_text) # Both GPU architectures have the same output + self.assertEqual( + EXPECTED_TEXT_COMPLETION[expected_text_completion_idx], dynamic_text + ) # Both GPU architectures have the same output # Static Cache generated_ids = model.generate( diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index dfa0646f8f..cead890586 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -520,12 +520,16 @@ class MistralIntegrationTest(unittest.TestCase): EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in output. EXPECTED_SLICE = { 7: torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]), 8: torch.tensor([-5.8711, -5.8555, -0.1050, -4.7148, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -1.0781, 1.7568, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711]), + 9: torch.tensor([-5.8750, -5.8594, -0.1047, -4.7188, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -1.0781, 1.7578, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750, -5.8750]), } # fmt: skip - print(out[0, 0, :30]) torch.testing.assert_close( out[0, 0, :30], EXPECTED_SLICE[self.cuda_compute_capability_major_version], atol=1e-4, rtol=1e-4 ) @@ -625,9 +629,14 @@ class MistralIntegrationTest(unittest.TestCase): @slow def test_speculative_generation(self): + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. EXPECTED_TEXT_COMPLETION = { 7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs", 8: "My favourite condiment is 100% ketchup. I love it on everything. I’m not a big", + 9: "My favourite condiment is 100% ketchup. I love it on everything. 
I’m not a big", } prompt = "My favourite condiment is " tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 0972207fce..b60062004a 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -553,6 +553,10 @@ class MixtralIntegrationTest(unittest.TestCase): ) # TODO: might need to tweak it in case the logits do not match on our daily runners # these logits have been obtained with the original megablocks impelmentation. + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in output. EXPECTED_LOGITS = { 7: torch.Tensor([[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]]).to( torch_device @@ -560,6 +564,9 @@ class MixtralIntegrationTest(unittest.TestCase): 8: torch.Tensor([[0.1631, 0.1621, 0.6094], [-0.8906, -0.1621, -0.6094], [0.1572, 0.1270, 0.7227]]).to( torch_device ), + 9: torch.Tensor([[0.1641, 0.1621, 0.6094], [-0.8906, -0.1631, -0.6094], [0.1572, 0.1260, 0.7227]]).to( + torch_device + ), } with torch.no_grad(): logits = model(dummy_input).logits @@ -583,6 +590,11 @@ class MixtralIntegrationTest(unittest.TestCase): ) # TODO: might need to tweak it in case the logits do not match on our daily runners + # + # Key 9 for MI300, Key 8 for A100/A10, and Key 7 for T4. + # + # Note: Key 9 is currently set for MI300, but may need potential future adjustments for H100s, + # considering differences in hardware processing and potential deviations in generated text. 
EXPECTED_LOGITS_LEFT = { 7: torch.Tensor( [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]], @@ -590,6 +602,9 @@ class MixtralIntegrationTest(unittest.TestCase): 8: torch.Tensor([[0.1914, 0.0508, 0.7188], [0.1953, 0.0510, 0.7227], [0.1973, 0.0562, 0.7148]]).to( torch_device ), + 9: torch.Tensor([[0.1904, 0.0513, 0.7227], [0.1943, 0.0518, 0.7227], [0.1982, 0.0557, 0.7148]]).to( + torch_device + ), } EXPECTED_LOGITS_LEFT_UNPADDED = { @@ -599,6 +614,9 @@ class MixtralIntegrationTest(unittest.TestCase): 8: torch.Tensor([[0.2217, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7109, 0.2461]]).to( torch_device ), + 9: torch.Tensor([[0.2236, 0.5195, -0.3828], [0.8203, -0.2285, 0.6055], [0.2637, -0.7109, 0.2451]]).to( + torch_device + ), } EXPECTED_LOGITS_RIGHT_UNPADDED = { @@ -608,6 +626,9 @@ class MixtralIntegrationTest(unittest.TestCase): 8: torch.Tensor([[0.2178, 0.1260, -0.1621], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]]).to( torch_device ), + 9: torch.Tensor([[0.2197, 0.1250, -0.1611], [-0.3516, 0.3008, -1.0312], [0.0684, 0.7930, 0.8008]]).to( + torch_device + ), } with torch.no_grad(): diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index fbd237bc10..86bbc7b49d 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -26,7 +26,14 @@ import numpy as np from datasets import load_dataset from transformers import PerceiverConfig -from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device +from transformers.testing_utils import ( + IS_ROCM_SYSTEM, + require_torch, + require_torch_multi_gpu, + require_vision, + slow, + torch_device, +) from transformers.utils import is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -930,7 +937,8 @@ class PerceiverModelIntegrationTest(unittest.TestCase): expected_slice = torch.tensor([-1.1652, -0.1992, -0.7520], device=torch_device) - self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4)) + atol = 1e-3 if IS_ROCM_SYSTEM else 1e-4 + self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=atol)) @slow def test_inference_image_classification_fourier(self): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 2d76264341..fc7c3dc783 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -2551,7 +2551,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): ) eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") - training_args = TrainingArguments(output_dir="./examples", use_cpu=True) + training_args = TrainingArguments(output_dir="./examples", use_cpu=True, report_to="none") trainer = Trainer(model=model, args=training_args, eval_dataset=eval_dataset) result = trainer.evaluate() self.assertLess(result["eval_loss"], 0.2) @@ -2572,6 +2572,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): output_dir="./examples", use_cpu=True, per_device_eval_batch_size=1, + report_to="none", ) trainer = Trainer( model=model, @@ -3107,6 +3108,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): "--predict_with_generate", "--ddp_timeout", "60", + "--report_to", + "none", ] execute_subprocess_async(command) # successful return here == success - any errors would have caused an error or a timeout in the sub-call diff --git 
a/tests/trainer/test_trainer_seq2seq.py b/tests/trainer/test_trainer_seq2seq.py index 61d2163b9e..17593e2e73 100644 --- a/tests/trainer/test_trainer_seq2seq.py +++ b/tests/trainer/test_trainer_seq2seq.py @@ -161,7 +161,6 @@ class Seq2seqTrainerTester(TestCasePlus): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=lambda x: {"samples": x[0].shape[0]}, - report_to="none", ) def prepare_data(examples): From daf281f44f654abd5e7ed07e37985cee2250c3af Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 21 May 2024 07:50:11 -0400 Subject: [PATCH 21/80] Enforce saving at end of training if saving option chosen (#30160) * Enforce saving at end of training * Fix test * Rework test * Fixup tests' * Update comment based on sourab feedback * Clean --- src/transformers/trainer_callback.py | 3 ++ src/transformers/training_args.py | 3 ++ tests/trainer/test_trainer.py | 51 ++++++++++++++++++++++++++ tests/trainer/test_trainer_callback.py | 2 +- 4 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index a21c46fea9..d2570ed8ba 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -544,6 +544,9 @@ class DefaultFlowCallback(TrainerCallback): # End training if state.global_step >= state.max_steps: control.should_training_stop = True + # Save the model at the end if we have a save strategy + if args.save_strategy != IntervalStrategy.NO: + control.should_save = True return control diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 24bb3c7d02..60a81f7454 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -335,6 +335,9 @@ class TrainingArguments: - `"no"`: No save is done during training. - `"epoch"`: Save is done at the end of each epoch. - `"steps"`: Save is done every `save_steps`. + + If `"epoch"` or `"steps"` is chosen, saving will also be performed at the + very end of training, always. save_steps (`int` or `float`, *optional*, defaults to 500): Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. 
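To make the new save-at-end rule above concrete, here is a minimal sketch of a configuration it affects; the step counts mirror the test added below, and `output_dir` is an arbitrary placeholder:

```python
from transformers import TrainingArguments

# With save_strategy="steps" and save_steps=5, a 9-step run previously produced only
# checkpoint-5 (9 is not a multiple of 5). After this change, a save is also forced
# at the final step, so both checkpoint-5 and checkpoint-9 are written.
args = TrainingArguments(
    output_dir="out",
    save_strategy="steps",
    save_steps=5,
    max_steps=9,
)
```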
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index fc7c3dc783..4d3fc57340 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -129,6 +129,7 @@ if is_torch_available(): if is_safetensors_available(): import safetensors.torch + # for version specific tests in TrainerIntegrationTest require_accelerate_version_min_0_28 = partial(require_accelerate, min_version="0.28") GRAD_ACCUM_KWARGS_VERSION_AVAILABLE = is_accelerate_available("0.28") @@ -2016,6 +2017,56 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors ) + def test_load_best_model_with_save(self): + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + max_steps=9, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists( + os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}") + ), f"Could not find checkpoint-{trainer.state.max_steps}" + # And then check the last step + assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9" + + # Now test that using a limit works + # Should result in: + # - save at step 5 (but is deleted) + # - save at step 10 (loaded in at the end when `load_best_model=True`) + # - save at step 11 + with tempfile.TemporaryDirectory() as tmpdir: + trainer = get_regression_trainer( + output_dir=tmpdir, + save_steps=5, + evaluation_strategy="steps", + eval_steps=5, + load_best_model_at_end=True, + save_total_limit=2, + max_steps=11, + ) + trainer.train() + # Check that we have the last known step: + assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11" + # And then check the last multiple + assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10" + # Finally check that we don't have an old one + assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected" + + # Finally check that the right model was loaded in, checkpoint-10 + # this goes by the last `eval` step check to do so, so it won't be + # the last model *saved* + model_state = trainer.model.state_dict() + final_model_weights = safetensors.torch.load_file( + os.path.join(tmpdir, "checkpoint-10", "model.safetensors") + ) + for k, v in model_state.items(): + assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same" + @require_torch_multi_accelerator def test_run_seq2seq_double_train_wrap_once(self): # test that we don't wrap the model more than once diff --git a/tests/trainer/test_trainer_callback.py b/tests/trainer/test_trainer_callback.py index 8c0c9367d8..9eeb1d5e41 100644 --- a/tests/trainer/test_trainer_callback.py +++ b/tests/trainer/test_trainer_callback.py @@ -153,7 +153,7 @@ class TrainerCallbackTest(unittest.TestCase): expected_events.append("on_log") if trainer.args.eval_strategy == IntervalStrategy.STEPS and step % trainer.args.eval_steps == 0: expected_events += evaluation_events.copy() - if step % trainer.args.save_steps == 0: + if step % trainer.args.save_steps == 0 or step == trainer.state.max_steps: expected_events.append("on_save") expected_events.append("on_epoch_end") if trainer.args.eval_strategy == IntervalStrategy.EPOCH: From 3b09d3f05f95629736bf5337ab118fa3327cc8c7 Mon Sep 17 00:00:00 2001 From: Matthew Beckers <17108752+mattlbeck@users.noreply.github.com> 
Date: Tue, 21 May 2024 13:56:52 +0100 Subject: [PATCH 22/80] fix: center_crop occasionally outputs off-by-one dimension matrix (#30934) If required padding for a crop larger than input image is odd-numbered, the padding would be rounded down instead of rounded up, causing the output dimension to be one smaller than it should be. --- src/transformers/image_transforms.py | 5 +++-- tests/test_image_transforms.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 016fae4405..65d6413db7 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -14,6 +14,7 @@ # limitations under the License. import warnings +from math import ceil from typing import Iterable, List, Optional, Tuple, Union import numpy as np @@ -483,9 +484,9 @@ def center_crop( new_image = np.zeros_like(image, shape=new_shape) # If the image is too small, pad it with zeros - top_pad = (new_height - orig_height) // 2 + top_pad = ceil((new_height - orig_height) / 2) bottom_pad = top_pad + orig_height - left_pad = (new_width - orig_width) // 2 + left_pad = ceil((new_width - orig_width) / 2) right_pad = left_pad + orig_width new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index ae86f84def..25775d787e 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -369,6 +369,10 @@ class ImageTransformsTester(unittest.TestCase): self.assertEqual(cropped_image.shape, (300, 260, 3)) self.assertTrue(np.allclose(cropped_image, expected_image)) + # Test that odd numbered padding requirement still leads to correct output dimensions + cropped_image = center_crop(image, (300, 259), data_format="channels_last") + self.assertEqual(cropped_image.shape, (300, 259, 3)) + # Test image with 4 channels is cropped correctly image = np.random.randint(0, 256, (224, 224, 4)) expected_image = image[52:172, 82:142, :] From 64e0573a8133db833521db9f23d05e36fc06c5f3 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 21 May 2024 15:15:19 +0200 Subject: [PATCH 23/80] [Benchmark] Reuse `optimum-benchmark` (#30615) * benchmark * update --------- Co-authored-by: ydshieh --- Makefile | 7 +- benchmark/__init__.py | 0 benchmark/benchmark.py | 310 ++++++++++++++++++ benchmark/config/generation.yaml | 57 ++++ benchmark/optimum_benchmark_wrapper.py | 16 + setup.py | 3 + src/transformers/dependency_versions_table.py | 1 + 7 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 benchmark/__init__.py create mode 100644 benchmark/benchmark.py create mode 100644 benchmark/config/generation.yaml create mode 100644 benchmark/optimum_benchmark_wrapper.py diff --git a/Makefile b/Makefile index ebc66d922c..6c1541a113 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples +.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples benchmark # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 
export PYTHONPATH = src @@ -96,6 +96,11 @@ test: test-examples: python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/ +# Run benchmark + +benchmark: + python3 benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=diff backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + # Run tests for SageMaker DLC release test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker] diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py new file mode 100644 index 0000000000..9e38c1f70a --- /dev/null +++ b/benchmark/benchmark.py @@ -0,0 +1,310 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run benchmark using the `optimum-benchmark` library with some customization in `transformers`. + +Assume we are under `transformers` root directory: (make sure the commits are valid commits) +```bash +python benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=9b9c7f03da625b13643e99205c691fe046461724 --metrics=decode.latency.mean,per_token.latency.mean,per_token.throughput.value backend.model=google/gemma-2b benchmark.input_shapes.sequence_length=5,7 benchmark.input_shapes.batch_size=1,2 --multirun +``` +""" + +import argparse +import glob +import json +import os.path +import re +import tempfile +from contextlib import contextmanager +from pathlib import Path + +from git import Repo + +from optimum_benchmark import Benchmark +from optimum_benchmark_wrapper import main + + +PATH_TO_REPO = Path(__file__).parent.parent.resolve() + + +@contextmanager +def checkout_commit(repo: Repo, commit_id: str): + """ + Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit. + Args: + repo (`git.Repo`): A git repository (for instance the Transformers repo). + commit_id (`str`): The commit reference to checkout inside the context manager. + """ + current_head = repo.head.commit if repo.head.is_detached else repo.head.ref + + try: + repo.git.checkout(commit_id) + yield + + finally: + repo.git.checkout(current_head) + + +def summarize(run_dir, metrics, expand_metrics=False): + """Produce a summary for each optimum-benchmark launched job's output directory found in `run_dir`. 
+ + Each summary's format is as follows (for `expand_metrics=False`): + ``` + { + "model": "google/gemma-2b", + "commit": "3cd6ed22e4d49219f300f5055e71e3929aba20d7", + "config": "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5", + "metrics": { + "decode.latency.mean": 1.624666809082031, + "per_token.latency.mean": 0.012843788806628804, + "per_token.throughput.value": 77.85864553330948 + } + } + ``` + """ + reports = glob.glob(os.path.join(run_dir, "**/benchmark_report.json"), recursive=True) + report_dirs = [str(Path(report).parent) for report in reports] + + summaries = [] + for report_dir in report_dirs: + commit = re.search(r"/commit=([^/]+)", report_dir).groups()[0] + + if not os.path.isfile(os.path.join(report_dir, "benchmark.json")): + continue + benchmark = Benchmark.from_json(os.path.join(report_dir, "benchmark.json")) + report = benchmark.report + + model = benchmark.config.backend["model"] + + # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`. + # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.) + benchmark_name = re.sub(f"backend.model={model},*", "", report_dir) + benchmark_name = str(Path(benchmark_name).parts[-1]) + if benchmark_name.startswith("commit="): + benchmark_name = benchmark.config.name + + metrics_values = {} + # post-processing of report: show a few selected/important metric + for metric in metrics: + keys = metric.split(".") + value = report + current = metrics_values + for key in keys: + # Avoid KeyError when a user's specified metric has typo. + # TODO: Give warnings. + if key not in value: + continue + value = value[key] + + if expand_metrics: + if isinstance(value, dict): + if key not in current: + current[key] = {} + current = current[key] + else: + current[key] = value + + if not expand_metrics: + metrics_values[metric] = value + + # show some config information + print(f"model: {model}") + print(f"commit: {commit}") + print(f"config: {benchmark_name}") + if len(metrics_values) > 0: + print("metrics:") + if expand_metrics: + print(metrics_values) + else: + for metric, value in metrics_values.items(): + print(f" - {metric}: {value}") + print("-" * 80) + + summary = { + "model": model, + "commit": commit, + "config": benchmark_name, + "metrics": metrics_values, + } + summaries.append(summary) + + with open(os.path.join(report_dir, "summary.json"), "w") as fp: + json.dump(summary, fp, indent=4) + + # TODO: upload to Hub + return summaries + + +def combine_summaries(summaries): + """Combine a list of summary obtained from the function `summarize`. 
+ + The combined summary's format is as follows: + ``` + "google/gemma-2b": { + "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.624666809082031} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": {"decode.latency.mean": 1.6278163452148438} + } + }, + "benchmark.input_shapes.batch_size=2,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.6947791748046876} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": { + "decode.latency.mean": 1.6980519409179688} + } + } + } + ``` + """ + combined = {} + for summary in summaries: + model = summary["model"] + config = summary["config"] + commit = summary["commit"] + + if model not in combined: + combined[model] = {} + + if config not in combined[model]: + combined[model][config] = {} + + if commit not in combined[model][config]: + combined[model][config][commit] = {"metrics": summary["metrics"]} + + with open(os.path.join(exp_run_dir, "summary.json"), "w") as fp: + json.dump(combined, fp, indent=4) + + # TODO: upload to Hub + print(json.dumps(combined, indent=4)) + + return combined + + +if __name__ == "__main__": + + def list_str(values): + return values.split(",") + + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + + # arguments specific to this wrapper for our own customization + parser.add_argument("--ensure_empty", type=bool, default=True, help="If to create a temporary directory.") + parser.add_argument( + "--commit", + type=list_str, + default="", + help="Comma-separated list of branch names and/or commit sha values on which the benchmark will run. If `diff` is specified, it will run on both the current head and the `main` branch.", + ) + parser.add_argument("--metrics", type=str, help="The metrics to be included in the summary.") + args, optimum_benchmark_args = parser.parse_known_args() + + repo = Repo(PATH_TO_REPO) + + metrics = [ + "prefill.latency.mean", + "prefill.throughput.value", + "decode.latency.mean", + "decode.throughput.value", + "per_token.latency.mean", + "per_token.throughput.value", + ] + if args.metrics is not None: + metrics = args.metrics.split(",") + + # Get `backend.model` in a hacky way: We want to control the experiment flow manually. 
+ models = [""] + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("backend.model="): + models = arg[len("backend.model=") :] + models = models.split(",") + break + optimum_benchmark_args = [arg for arg in optimum_benchmark_args if not arg.startswith("backend.model=")] + + # Get the commit(s) + current_head = str(repo.head.commit) if repo.head.is_detached else str(repo.head.ref) + commits = [x for x in args.commit if x != ""] + if len(commits) == 0: + commits = [current_head] + elif len(commits) == 1 and commits[0] == "diff": + # compare to `main` + commits = ["main", current_head] + + # Get the specified run directory + run_dir_arg_idx, run_dir = -1, None + sweep_dir_arg_idx, sweep_dir = -1, None + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("hydra.run.dir="): + run_dir = arg[len("hydra.run.dir=") :] + run_dir_arg_idx = idx + elif arg.startswith("hydra.sweep.dir="): + sweep_dir = arg[len("hydra.sweep.dir=") :] + sweep_dir_arg_idx = idx + exp_run_dir, arg_dix, arg_name = ( + (sweep_dir, sweep_dir_arg_idx, "hydra.sweep.dir") + if "--multirun" in optimum_benchmark_args + else (run_dir, run_dir_arg_idx, "hydra.run.dir") + ) + + # TODO: not hardcoded + if exp_run_dir is None and args.ensure_empty: + exp_run_dir = "_benchmark" + + if args.ensure_empty: + os.makedirs(exp_run_dir, exist_ok=True) + exp_run_dir = tempfile.mkdtemp(dir=exp_run_dir) + + run_summaries = [] + for commit in commits: + with checkout_commit(repo, commit): + commit = str(repo.head.commit) + + commit_run_dir = exp_run_dir + if exp_run_dir is not None: + commit_run_dir = os.path.join(exp_run_dir, rf"commit\={commit}") + + print(f"Run benchmark on commit: {commit}") + + for model in models: + model_arg = [f"backend.model={model}"] if model != "" else [] + dir_args = [] + if commit_run_dir is not None: + if arg_dix > -1: + optimum_benchmark_args[arg_dix] = f"{arg_name}={commit_run_dir}" + else: + dir_args = [ + f"hydra.sweep.dir={commit_run_dir}", + f"hydra.run.dir={commit_run_dir}/" + "${hydra.job.override_dirname}", + ] + main(args.config_dir, args.config_name, model_arg + dir_args + optimum_benchmark_args) + + if commit_run_dir is not None: + # Need to remove the `\` character + summaries = summarize(commit_run_dir.replace("\\", ""), metrics) + run_summaries.extend(summaries) + + # aggregate the information across the commits + if exp_run_dir is not None: + with open(os.path.join(exp_run_dir, "summaries.json"), "w") as fp: + json.dump(run_summaries, fp, indent=4) + + combined_summary = combine_summaries(run_summaries) diff --git a/benchmark/config/generation.yaml b/benchmark/config/generation.yaml new file mode 100644 index 0000000000..44a3f9ea49 --- /dev/null +++ b/benchmark/config/generation.yaml @@ -0,0 +1,57 @@ +defaults: + - benchmark # inheriting benchmark schema + - scenario: inference + - launcher: process + - backend: pytorch + - _self_ # for hydra 1.1 compatibility + +name: pytorch_generate + +launcher: + start_method: spawn + device_isolation: true + device_isolation_action: warn + +backend: + device: cuda + device_ids: 0 + no_weights: true + model: meta-llama/Llama-2-7b-hf + cache_implementation: static + torch_compile: true + torch_dtype: float16 + torch_compile_config: + backend: inductor + mode: reduce-overhead + fullgraph: true + +scenario: + input_shapes: + batch_size: 1 + sequence_length: 7 + generate_kwargs: + max_new_tokens: 128 + min_new_tokens: 128 + do_sample: false + memory: true + latency: true + iterations: 2 + duration: 0 + + +# hydra/cli specific 
settings +hydra: + run: + # where to store run results + dir: runs/${name} + job: + # change working directory to the run directory + chdir: true + env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before + OVERRIDE_BENCHMARKS: 1 + LOG_LEVEL: WARN + sweep: + dir: multirun + subdir: ${hydra.job.override_dirname} \ No newline at end of file diff --git a/benchmark/optimum_benchmark_wrapper.py b/benchmark/optimum_benchmark_wrapper.py new file mode 100644 index 0000000000..c43e9a73e3 --- /dev/null +++ b/benchmark/optimum_benchmark_wrapper.py @@ -0,0 +1,16 @@ +import argparse +import subprocess + + +def main(config_dir, config_name, args): + subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + args, unknown = parser.parse_known_args() + + main(args.config_dir, args.config_name, unknown) diff --git a/setup.py b/setup.py index dfd0b61cbe..efdb5342d0 100644 --- a/setup.py +++ b/setup.py @@ -136,6 +136,7 @@ _deps = [ "onnxruntime-tools>=1.4.2", "onnxruntime>=1.4.0", "opencv-python", + "optimum-benchmark>=0.2.0", "optuna", "optax>=0.0.8,<=0.1.4", "packaging>=20.0", @@ -410,6 +411,8 @@ extras["agents"] = deps_list( "diffusers", "accelerate", "datasets", "torch", "sentencepiece", "opencv-python", "Pillow" ) +extras["benchmark"] = deps_list("optimum-benchmark") + # when modifying the following list, make sure to update src/transformers/dependency_versions_check.py install_requires = [ deps["filelock"], # filesystem locks, e.g., to prevent parallel downloads diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index e4c9fbf0d9..ceca99d450 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -42,6 +42,7 @@ deps = { "onnxruntime-tools": "onnxruntime-tools>=1.4.2", "onnxruntime": "onnxruntime>=1.4.0", "opencv-python": "opencv-python", + "optimum-benchmark": "optimum-benchmark>=0.2.0", "optuna": "optuna", "optax": "optax>=0.0.8,<=0.1.4", "packaging": "packaging>=20.0", From eae2b6b89e72dbee531e5b96acc94b98161aa8a4 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 21 May 2024 15:54:41 +0200 Subject: [PATCH 24/80] TST / Workflows: Get slack notifications for docker image build (#30891) * Get slack notifications for docker image build * Apply suggestions from code review * Apply suggestions from code review --- .github/workflows/build-docker-images.yml | 83 ++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 7c9e86d091..f113579691 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -57,6 +57,15 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build + status: ${{ 
job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: [intel-cpu, 8-cpu, ci] @@ -93,6 +102,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) latest-torch-deepspeed-docker-for-push-ci-daily-build: name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" @@ -134,6 +152,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + doc-builder: name: "Doc builder" # Push CI doesn't need this image @@ -160,6 +187,15 @@ jobs: push: true tags: huggingface/transformers-doc-builder + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-doc-builder docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch: name: "Latest PyTorch [dev]" # Push CI doesn't need this image @@ -198,6 +234,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-gpu + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch-amd: name: "Latest PyTorch (AMD) [dev]" runs-on: [intel-cpu, 8-cpu, ci] @@ -237,6 +282,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-amd-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-tensorflow: name: "Latest TensorFlow [dev]" # Push CI doesn't need this image @@ -265,6 +319,15 @@ jobs: push: true tags: huggingface/transformers-tensorflow-gpu + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" runs-on: [intel-cpu, 8-cpu, ci] @@ -304,6 +367,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + - name: Post to Slack + if: always() + uses: 
huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]" # Push CI doesn't need this image @@ -330,4 +402,13 @@ jobs: build-args: | REF=main push: true - tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} \ No newline at end of file + tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-quantization-latest-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} From d24097e0229485287ff4959258c552168bd898c6 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 21 May 2024 15:40:19 +0100 Subject: [PATCH 25/80] Fix swin embeddings interpolation (#30936) --- .../models/donut/modeling_donut_swin.py | 19 +++---------------- .../maskformer/modeling_maskformer_swin.py | 19 +++---------------- src/transformers/models/swin/modeling_swin.py | 17 ++--------------- .../models/swinv2/modeling_swinv2.py | 17 ++--------------- 4 files changed, 10 insertions(+), 62 deletions(-) diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index c1b27cd180..5b8a0e27b0 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -205,9 +205,7 @@ class DonutSwinEmbeddings(nn.Module): interpolate_pos_encoding: bool = False, ) -> Tuple[torch.Tensor]: _, num_channels, height, width = pixel_values.shape - embeddings, output_dimensions = self.patch_embeddings( - pixel_values, interpolate_pos_encoding=interpolate_pos_encoding - ) + embeddings, output_dimensions = self.patch_embeddings(pixel_values) embeddings = self.norm(embeddings) batch_size, seq_len, _ = embeddings.size() @@ -228,7 +226,7 @@ class DonutSwinEmbeddings(nn.Module): return embeddings, output_dimensions -# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->DonutSwin class DonutSwinPatchEmbeddings(nn.Module): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial @@ -260,21 +258,10 @@ class DonutSwinPatchEmbeddings(nn.Module): pixel_values = nn.functional.pad(pixel_values, pad_values) return pixel_values - def forward( - self, pixel_values: Optional[torch.FloatTensor], interpolate_pos_encoding: bool = False - ) -> Tuple[torch.Tensor, Tuple[int]]: + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: _, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
- ) # pad the input to be divisible by self.patch_size, if needed pixel_values = self.maybe_pad(pixel_values, height, width) - if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) embeddings = self.projection(pixel_values) _, _, height, width = embeddings.shape output_dimensions = (height, width) diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index fc9c642adc..ef607ec811 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -197,9 +197,7 @@ class MaskFormerSwinEmbeddings(nn.Module): def forward(self, pixel_values, interpolate_pos_encoding): _, num_channels, height, width = pixel_values.shape - embeddings, output_dimensions = self.patch_embeddings( - pixel_values, interpolate_pos_encoding=interpolate_pos_encoding - ) + embeddings, output_dimensions = self.patch_embeddings(pixel_values) embeddings = self.norm(embeddings) if self.position_embeddings is not None: @@ -213,7 +211,7 @@ class MaskFormerSwinEmbeddings(nn.Module): return embeddings, output_dimensions -# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->MaskFormerSwin class MaskFormerSwinPatchEmbeddings(nn.Module): """ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial @@ -245,21 +243,10 @@ class MaskFormerSwinPatchEmbeddings(nn.Module): pixel_values = nn.functional.pad(pixel_values, pad_values) return pixel_values - def forward( - self, pixel_values: Optional[torch.FloatTensor], interpolate_pos_encoding: bool = False - ) -> Tuple[torch.Tensor, Tuple[int]]: + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: _, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) # pad the input to be divisible by self.patch_size, if needed pixel_values = self.maybe_pad(pixel_values, height, width) - if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." 
- ) embeddings = self.projection(pixel_values) _, _, height, width = embeddings.shape output_dimensions = (height, width) diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 2a6363c8e6..9f64f8009a 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -291,9 +291,7 @@ class SwinEmbeddings(nn.Module): interpolate_pos_encoding: bool = False, ) -> Tuple[torch.Tensor]: _, num_channels, height, width = pixel_values.shape - embeddings, output_dimensions = self.patch_embeddings( - pixel_values, interpolate_pos_encoding=interpolate_pos_encoding - ) + embeddings, output_dimensions = self.patch_embeddings(pixel_values) embeddings = self.norm(embeddings) batch_size, seq_len, _ = embeddings.size() @@ -345,21 +343,10 @@ class SwinPatchEmbeddings(nn.Module): pixel_values = nn.functional.pad(pixel_values, pad_values) return pixel_values - def forward( - self, pixel_values: Optional[torch.FloatTensor], interpolate_pos_encoding: bool = False - ) -> Tuple[torch.Tensor, Tuple[int]]: + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: _, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." - ) # pad the input to be divisible by self.patch_size, if needed pixel_values = self.maybe_pad(pixel_values, height, width) - if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) embeddings = self.projection(pixel_values) _, _, height, width = embeddings.shape output_dimensions = (height, width) diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index ac8ec197e5..48a4c65722 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -334,9 +334,7 @@ class Swinv2Embeddings(nn.Module): interpolate_pos_encoding: bool = False, ) -> Tuple[torch.Tensor]: _, num_channels, height, width = pixel_values.shape - embeddings, output_dimensions = self.patch_embeddings( - pixel_values, interpolate_pos_encoding=interpolate_pos_encoding - ) + embeddings, output_dimensions = self.patch_embeddings(pixel_values) embeddings = self.norm(embeddings) batch_size, seq_len, _ = embeddings.size() @@ -389,21 +387,10 @@ class Swinv2PatchEmbeddings(nn.Module): pixel_values = nn.functional.pad(pixel_values, pad_values) return pixel_values - def forward( - self, pixel_values: Optional[torch.FloatTensor], interpolate_pos_encoding: bool = False - ) -> Tuple[torch.Tensor, Tuple[int]]: + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: _, num_channels, height, width = pixel_values.shape - if num_channels != self.num_channels: - raise ValueError( - "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
- ) # pad the input to be divisible by self.patch_size, if needed pixel_values = self.maybe_pad(pixel_values, height, width) - if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): - raise ValueError( - f"Input image size ({height}*{width}) doesn't match model" - f" ({self.image_size[0]}*{self.image_size[1]})." - ) embeddings = self.projection(pixel_values) _, _, height, width = embeddings.shape output_dimensions = (height, width) From 5bf9caa06df47acda751ff5bcef95b937b86c71f Mon Sep 17 00:00:00 2001 From: Lu Teng Date: Wed, 22 May 2024 01:14:11 +0800 Subject: [PATCH 26/80] Fix inhomogeneous shape error in example (#30434) Fix inhomogeneous shape error in example. --- examples/flax/question-answering/run_qa.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 7c705090f1..16a744ddc3 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -426,7 +426,8 @@ def eval_data_collator(dataset: Dataset, batch_size: int): for idx in batch_idx: batch = dataset[idx] - batch = {k: np.array(v) for k, v in batch.items()} + # Ignore `offset_mapping` to avoid numpy/JAX array conversion issue. + batch = {k: np.array(v) for k, v in batch.items() if k != "offset_mapping"} yield batch @@ -1000,7 +1001,6 @@ def main(): position=2, ): _ = batch.pop("example_id") - _ = batch.pop("offset_mapping") predictions = pad_shard_unpad(p_eval_step)( state, batch, min_device_batch=per_device_eval_batch_size ) @@ -1055,7 +1055,6 @@ def main(): eval_loader, total=math.ceil(len(eval_dataset) / eval_batch_size), desc="Evaluating ...", position=2 ): _ = batch.pop("example_id") - _ = batch.pop("offset_mapping") predictions = pad_shard_unpad(p_eval_step)(state, batch, min_device_batch=per_device_eval_batch_size) start_logits = np.array(predictions[0]) end_logits = np.array(predictions[1]) From 60bb571e993b7d73257fb64044726b569fef9403 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 21 May 2024 19:38:02 +0200 Subject: [PATCH 27/80] =?UTF-8?q?=F0=9F=9A=A8=20[Idefics2]=20Update=20igno?= =?UTF-8?q?re=20index=20(#30898)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update ignore index * Update docs * Update docs --- docs/source/en/model_doc/idefics2.md | 52 +++++++++++++++++++ .../models/idefics2/modeling_idefics2.py | 2 +- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 31a7a1cdeb..5ad56b7b5c 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -87,6 +87,58 @@ generated_text = processor.batch_decode(generated_text, skip_special_tokens=True print("Generated text:", generated_text) ``` +- During training, it's important to determine which tokens the model should not learn. For Idefics2, this typically comes down to the image and padding tokens. 
This means that one can create the labels as follows: + +```python +import requests +from PIL import Image +from transformers import Idefics2Processor, Idefics2ForConditionalGeneration +import torch + +url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" +url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" + +image_1 = Image.open(requests.get(url_1, stream=True).raw) +image_2 = Image.open(requests.get(url_2, stream=True).raw) +images = [image_1, image_2] + +messages = [{ + "role": "user", + "content": [ + {"type": "text", "text": "What’s the difference between these two images?"}, + {"type": "image"}, + {"type": "image"}, + ], +}, +{ + "role": "assistant", + "content": [ + {"type": "text", "text": "The difference is that one image is about dogs and the other one about cats."}, + ], +}] + +device = "cuda" if torch.cuda.is_available() else "cpu" + +processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b") +model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b") +model.to(device) + +text = processor.apply_chat_template(messages, add_generation_prompt=False) +inputs = processor(images=images, text=text, return_tensors="pt").to(device) + +labels = inputs.input_ids.clone() +labels[labels == processor.tokenizer.pad_token_id] = -100 +labels[labels == model.config.image_token_id] = -100 + +inputs["labels"] = labels + +outputs = model(**inputs) +loss = outputs.loss +loss.backward() +``` + +Do note that when training Idefics2 on multi-turn conversations between a user and an assistant, one typically also sets all the tokens corresponding to the user messages to -100. + ## Model optimizations: Flash Attention The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. 
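The `-100` labels above rely on the fact that PyTorch's `CrossEntropyLoss` ignores that index by default, which is consistent with dropping the explicit `ignore_index` override in the modeling change below. A small self-contained sketch, with random logits and arbitrary label values purely for illustration:

```python
import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(6, 32)                       # (num_tokens, vocab_size)
labels = torch.tensor([3, 7, -100, -100, 12, 5])  # -100 marks positions to skip

# CrossEntropyLoss defaults to ignore_index=-100, so only the four labelled
# positions contribute to the loss.
loss = CrossEntropyLoss()(logits, labels)
```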
diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 17ed18f6b9..6acabad063 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -1857,7 +1857,7 @@ class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel): shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=self.image_token_id) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) if not return_dict: From 673440d073d5f534a6d6bedeeca94869afd8d0a7 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 22 May 2024 06:40:15 +0200 Subject: [PATCH 28/80] update ruff version (#30932) * update ruff version * fix research projects * Empty * Fix errors --------- Co-authored-by: Lysandre --- Makefile | 2 +- .../flax/language-modeling/run_mlm_flax.py | 1 + .../flax/language-modeling/run_t5_mlm_flax.py | 1 + examples/flax/question-answering/utils_qa.py | 1 + .../flax/text-classification/run_flax_glue.py | 3 +- .../flax/token-classification/run_flax_ner.py | 3 +- examples/legacy/benchmarking/plot_csv_file.py | 12 +- examples/legacy/benchmarking/run_benchmark.py | 2 +- .../multiple_choice/run_multiple_choice.py | 3 +- .../multiple_choice/utils_multiple_choice.py | 3 +- .../legacy/question-answering/run_squad.py | 3 +- .../question-answering/run_squad_trainer.py | 3 +- examples/legacy/run_language_modeling.py | 1 - examples/legacy/run_openai_gpt.py | 1 + examples/legacy/run_swag.py | 3 +- examples/legacy/run_transfo_xl.py | 9 +- examples/legacy/seq2seq/xla_spawn.py | 1 - .../legacy/token-classification/run_ner.py | 3 +- .../legacy/token-classification/utils_ner.py | 3 +- .../run_image_classification_no_trainer.py | 3 +- .../run_object_detection_no_trainer.py | 2 +- .../pytorch/question-answering/trainer_qa.py | 1 + .../question-answering/trainer_seq2seq_qa.py | 1 + .../pytorch/question-answering/utils_qa.py | 1 + .../run_semantic_segmentation_no_trainer.py | 2 +- .../run_wav2vec2_pretraining_no_trainer.py | 2 +- .../run_speech_recognition_ctc.py | 2 +- .../run_speech_recognition_ctc_adapter.py | 2 +- .../text-classification/run_classification.py | 2 +- .../pytorch/text-classification/run_glue.py | 2 +- .../run_glue_no_trainer.py | 3 +- .../pytorch/text-classification/run_xnli.py | 4 +- .../pytorch/text-generation/run_generation.py | 4 +- .../run_generation_contrastive_search.py | 3 +- examples/pytorch/xla_spawn.py | 1 - .../research_projects/adversarial/run_hans.py | 2 +- .../pabee/modeling_pabee_albert.py | 2 +- .../pabee/modeling_pabee_bert.py | 3 +- .../run_glue_with_pabee.py | 3 +- .../bertabs/configuration_bertabs.py | 3 +- ...ert_bertabs_original_pytorch_checkpoint.py | 2 +- .../bertology/run_bertology.py | 13 +- .../bertology/run_prune_gpt.py | 2 +- .../distillation/distiller.py | 5 +- .../distillation/grouped_batch_sampler.py | 4 +- .../distillation/lm_seqs_dataset.py | 5 +- .../distillation/run_squad_w_distillation.py | 2 +- .../distillation/scripts/binarized_data.py | 1 + .../distillation/scripts/extract.py | 1 + .../scripts/extract_distilbert.py | 1 + .../distillation/scripts/token_counts.py | 1 + .../research_projects/distillation/train.py | 1 + .../research_projects/distillation/utils.py | 5 +- .../dataset-streaming/run_mlm_flax_stream.py | 1 + .../lxmert/modeling_frcnn.py | 27 
++-- .../lxmert/processing_image.py | 27 ++-- examples/research_projects/lxmert/utils.py | 26 ++-- .../lxmert/visualizing_image.py | 27 ++-- .../research_projects/mm-imdb/run_mmimdb.py | 3 +- .../movement-pruning/counts_parameters.py | 1 + .../emmental/configuration_bert_masked.py | 3 +- .../emmental/modeling_bert_masked.py | 1 - .../movement-pruning/masked_run_glue.py | 2 +- .../movement-pruning/masked_run_squad.py | 3 +- .../onnx/summarization/run_onnx_exporter.py | 3 +- .../performer/run_mlm_performer.py | 1 + .../evaluate-hf-trt-qa.py | 3 +- .../quantization-qdqbert/quant_trainer.py | 1 + .../quantization-qdqbert/utils_qa.py | 1 + .../rag-end2end-retriever/eval_rag.py | 2 +- examples/research_projects/rag/eval_rag.py | 2 +- .../run_speech_recognition_ctc_bnb.py | 2 +- .../run_speech_recognition_ctc_streaming.py | 2 +- .../visual_bert/modeling_frcnn.py | 27 ++-- .../visual_bert/processing_image.py | 27 ++-- .../research_projects/visual_bert/utils.py | 26 ++-- .../visual_bert/visualizing_image.py | 27 ++-- .../xtreme-s/run_xtreme_s.py | 2 +- .../tensorflow/benchmarking/plot_csv_file.py | 12 +- .../benchmarking/run_benchmark_tf.py | 2 +- .../tensorflow/question-answering/utils_qa.py | 1 + .../text-classification/run_glue.py | 2 +- .../run_text_classification.py | 2 +- setup.py | 2 +- src/transformers/audio_utils.py | 1 + src/transformers/benchmark/benchmark.py | 3 +- src/transformers/benchmark/benchmark_tf.py | 3 +- src/transformers/benchmark/benchmark_utils.py | 1 - src/transformers/configuration_utils.py | 3 +- src/transformers/convert_graph_to_onnx.py | 8 +- .../convert_pytorch_checkpoint_to_tf2.py | 3 +- ...ert_slow_tokenizers_checkpoints_to_fast.py | 2 +- ...nvert_tf_hub_seq_to_seq_bert_to_pytorch.py | 1 - .../data/metrics/squad_metrics.py | 1 - src/transformers/data/processors/glue.py | 2 +- src/transformers/data/processors/xnli.py | 3 +- src/transformers/deepspeed.py | 1 + src/transformers/dependency_versions_table.py | 2 +- src/transformers/dynamic_module_utils.py | 1 + .../feature_extraction_sequence_utils.py | 3 +- src/transformers/feature_extraction_utils.py | 2 +- .../generation/configuration_utils.py | 2 +- .../generation/stopping_criteria.py | 6 +- src/transformers/generation/watermarking.py | 1 - src/transformers/integrations/aqlm.py | 1 - src/transformers/integrations/awq.py | 1 + src/transformers/integrations/deepspeed.py | 1 + src/transformers/integrations/ggml.py | 1 + .../integrations/integration_utils.py | 1 + src/transformers/modelcard.py | 3 +- .../modeling_flax_pytorch_utils.py | 3 +- src/transformers/modeling_tf_pytorch_utils.py | 3 +- .../models/albert/configuration_albert.py | 3 +- ...lbert_original_tf_checkpoint_to_pytorch.py | 1 - .../models/albert/modeling_tf_albert.py | 3 +- .../models/albert/tokenization_albert.py | 3 +- .../models/albert/tokenization_albert_fast.py | 3 +- .../models/align/configuration_align.py | 2 +- .../models/align/modeling_align.py | 2 +- .../models/align/processing_align.py | 1 - .../models/altclip/configuration_altclip.py | 3 +- .../models/altclip/modeling_altclip.py | 3 +- .../models/altclip/processing_altclip.py | 1 + ...iguration_audio_spectrogram_transformer.py | 3 +- ...trogram_transformer_original_to_pytorch.py | 1 - .../modeling_audio_spectrogram_transformer.py | 2 +- src/transformers/models/auto/auto_factory.py | 1 + .../models/auto/configuration_auto.py | 3 +- .../models/auto/feature_extraction_auto.py | 3 +- .../models/auto/image_processing_auto.py | 3 +- src/transformers/models/auto/modeling_auto.py | 2 +- 
.../models/auto/modeling_flax_auto.py | 3 +- .../models/auto/modeling_tf_auto.py | 3 +- .../models/auto/processing_auto.py | 3 +- .../models/auto/tokenization_auto.py | 2 +- .../autoformer/configuration_autoformer.py | 2 +- .../models/autoformer/modeling_autoformer.py | 2 +- .../models/bark/configuration_bark.py | 2 +- .../models/bark/convert_suno_to_hf.py | 1 + .../bark/generation_configuration_bark.py | 2 +- src/transformers/models/bark/modeling_bark.py | 3 +- .../models/bark/processing_bark.py | 1 + .../models/bart/configuration_bart.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - src/transformers/models/bart/modeling_bart.py | 3 +- .../models/bart/modeling_flax_bart.py | 2 +- .../models/bart/modeling_tf_bart.py | 3 +- .../models/barthez/tokenization_barthez.py | 3 +- .../barthez/tokenization_barthez_fast.py | 3 +- .../models/bartpho/tokenization_bartpho.py | 3 +- .../models/beit/configuration_beit.py | 3 +- .../beit/convert_beit_unilm_to_pytorch.py | 1 - src/transformers/models/beit/modeling_beit.py | 3 +- .../models/bert/configuration_bert.py | 3 +- ...bert_original_tf2_checkpoint_to_pytorch.py | 1 + ..._bert_original_tf_checkpoint_to_pytorch.py | 1 - ...ping_original_tf2_checkpoint_to_pytorch.py | 1 + .../models/bert/modeling_tf_bert.py | 3 +- .../models/bert/tokenization_bert.py | 1 - .../configuration_bert_generation.py | 2 +- .../tokenization_bert_generation.py | 3 +- .../tokenization_bert_japanese.py | 1 - .../models/bertweet/tokenization_bertweet.py | 3 +- .../models/big_bird/configuration_big_bird.py | 3 +- ...gbird_original_tf_checkpoint_to_pytorch.py | 1 - .../models/big_bird/modeling_big_bird.py | 11 +- .../models/big_bird/tokenization_big_bird.py | 1 - .../big_bird/tokenization_big_bird_fast.py | 3 +- .../configuration_bigbird_pegasus.py | 2 +- .../modeling_bigbird_pegasus.py | 10 +- .../models/biogpt/configuration_biogpt.py | 2 +- .../models/biogpt/modeling_biogpt.py | 3 +- .../models/biogpt/tokenization_biogpt.py | 1 + .../models/bit/configuration_bit.py | 2 +- .../models/bit/convert_bit_to_pytorch.py | 1 - src/transformers/models/bit/modeling_bit.py | 2 +- .../blenderbot/configuration_blenderbot.py | 2 +- .../models/blenderbot/modeling_blenderbot.py | 3 +- .../blenderbot/modeling_flax_blenderbot.py | 2 +- .../blenderbot/modeling_tf_blenderbot.py | 3 +- .../tokenization_blenderbot_fast.py | 1 + .../configuration_blenderbot_small.py | 2 +- .../modeling_blenderbot_small.py | 3 +- .../modeling_flax_blenderbot_small.py | 3 +- .../modeling_tf_blenderbot_small.py | 3 +- .../tokenization_blenderbot_small_fast.py | 1 + .../models/blip/configuration_blip.py | 2 +- src/transformers/models/blip/modeling_blip.py | 2 +- .../models/blip/modeling_tf_blip.py | 2 +- .../models/blip_2/configuration_blip_2.py | 2 +- .../models/blip_2/modeling_blip_2.py | 2 +- .../models/bloom/configuration_bloom.py | 3 +- ...rt_bloom_original_checkpoint_to_pytorch.py | 1 - .../models/bloom/tokenization_bloom_fast.py | 1 - .../bridgetower/configuration_bridgetower.py | 2 +- .../models/bros/configuration_bros.py | 2 +- src/transformers/models/bros/modeling_bros.py | 3 +- ..._byt5_original_tf_checkpoint_to_pytorch.py | 1 - .../models/byt5/tokenization_byt5.py | 3 +- .../camembert/configuration_camembert.py | 2 +- .../models/camembert/modeling_tf_camembert.py | 3 +- .../camembert/tokenization_camembert.py | 3 +- .../camembert/tokenization_camembert_fast.py | 3 +- .../models/canine/configuration_canine.py | 2 +- ...anine_original_tf_checkpoint_to_pytorch.py | 1 - 
.../models/canine/modeling_canine.py | 3 +- .../configuration_chinese_clip.py | 2 +- .../chinese_clip/modeling_chinese_clip.py | 3 +- .../models/clap/configuration_clap.py | 2 +- .../models/clap/feature_extraction_clap.py | 1 - src/transformers/models/clap/modeling_clap.py | 3 +- .../models/clip/configuration_clip.py | 2 +- src/transformers/models/clip/modeling_clip.py | 3 +- .../models/clip/modeling_tf_clip.py | 3 +- .../models/clip/tokenization_clip_fast.py | 1 - .../models/clipseg/configuration_clipseg.py | 2 +- .../models/clipseg/modeling_clipseg.py | 2 +- .../models/clvp/configuration_clvp.py | 3 +- src/transformers/models/clvp/modeling_clvp.py | 3 +- .../models/clvp/number_normalizer.py | 1 - .../models/clvp/processing_clvp.py | 1 - .../code_llama/tokenization_code_llama.py | 1 + .../models/codegen/configuration_codegen.py | 3 +- .../models/codegen/modeling_codegen.py | 2 +- .../models/codegen/tokenization_codegen.py | 1 - .../codegen/tokenization_codegen_fast.py | 1 - .../models/cohere/configuration_cohere.py | 2 +- .../configuration_conditional_detr.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../modeling_conditional_detr.py | 3 +- .../models/convbert/configuration_convbert.py | 2 +- .../models/convbert/modeling_convbert.py | 135 +++++++++--------- .../models/convbert/modeling_tf_convbert.py | 3 +- .../models/convbert/tokenization_convbert.py | 1 + .../convbert/tokenization_convbert_fast.py | 1 + .../models/convnext/configuration_convnext.py | 2 +- .../convnext/convert_convnext_to_pytorch.py | 1 - .../models/convnext/modeling_convnext.py | 3 +- .../models/convnext/modeling_tf_convnext.py | 3 +- .../convnextv2/configuration_convnextv2.py | 3 +- .../models/convnextv2/modeling_convnextv2.py | 3 +- .../convnextv2/modeling_tf_convnextv2.py | 3 +- .../models/cpm/tokenization_cpm.py | 1 + .../models/cpm/tokenization_cpm_fast.py | 1 + .../models/cpmant/configuration_cpmant.py | 2 +- .../models/cpmant/modeling_cpmant.py | 3 +- .../models/cpmant/tokenization_cpmant.py | 1 + .../models/ctrl/configuration_ctrl.py | 2 +- src/transformers/models/ctrl/modeling_ctrl.py | 2 +- .../models/ctrl/modeling_tf_ctrl.py | 2 +- .../models/ctrl/tokenization_ctrl.py | 1 - .../models/cvt/configuration_cvt.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - src/transformers/models/cvt/modeling_cvt.py | 3 +- .../models/cvt/modeling_tf_cvt.py | 3 +- .../data2vec/configuration_data2vec_audio.py | 2 +- .../data2vec/configuration_data2vec_text.py | 3 +- .../data2vec/configuration_data2vec_vision.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../data2vec/modeling_data2vec_audio.py | 2 +- .../data2vec/modeling_data2vec_vision.py | 3 +- .../data2vec/modeling_tf_data2vec_vision.py | 3 +- .../models/dbrx/configuration_dbrx.py | 2 +- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- .../models/deberta/configuration_deberta.py | 3 +- .../models/deberta/modeling_deberta.py | 2 +- .../models/deberta/modeling_tf_deberta.py | 3 +- .../models/deberta/tokenization_deberta.py | 2 +- .../deberta/tokenization_deberta_fast.py | 2 +- .../deberta_v2/configuration_deberta_v2.py | 3 +- .../models/deberta_v2/modeling_deberta_v2.py | 2 +- .../deberta_v2/modeling_tf_deberta_v2.py | 2 +- .../deberta_v2/tokenization_deberta_v2.py | 2 +- .../configuration_decision_transformer.py | 2 +- .../modeling_decision_transformer.py | 2 +- .../configuration_deformable_detr.py | 2 +- .../convert_deformable_detr_to_pytorch.py | 1 - 
.../models/deformable_detr/load_custom.py | 3 +- .../modeling_deformable_detr.py | 3 +- .../models/deit/configuration_deit.py | 2 +- .../deit/convert_deit_timm_to_pytorch.py | 1 - src/transformers/models/deit/modeling_deit.py | 3 +- .../models/deit/modeling_tf_deit.py | 3 +- ...original_gluonnlp_checkpoint_to_pytorch.py | 1 - .../models/deprecated/mctct/modeling_mctct.py | 3 +- .../deprecated/mctct/processing_mctct.py | 1 + .../deprecated/mmbt/configuration_mmbt.py | 2 +- .../models/deprecated/mmbt/modeling_mmbt.py | 1 - .../open_llama/configuration_open_llama.py | 2 +- .../open_llama/modeling_open_llama.py | 3 +- .../retribert/configuration_retribert.py | 2 +- .../retribert/modeling_retribert.py | 1 - .../configuration_trajectory_transformer.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- .../modeling_trajectory_transformer.py | 2 +- .../transfo_xl/configuration_transfo_xl.py | 2 +- ...fo_xl_original_tf_checkpoint_to_pytorch.py | 1 - .../transfo_xl/modeling_tf_transfo_xl.py | 2 +- .../modeling_tf_transfo_xl_utilities.py | 3 +- .../transfo_xl/modeling_transfo_xl.py | 5 +- .../modeling_transfo_xl_utilities.py | 3 +- .../transfo_xl/tokenization_transfo_xl.py | 3 +- .../deprecated/van/configuration_van.py | 2 +- .../deprecated/van/convert_van_to_pytorch.py | 1 - .../models/deprecated/van/modeling_van.py | 2 +- .../configuration_depth_anything.py | 2 +- .../convert_depth_anything_to_hf.py | 1 - .../depth_anything/modeling_depth_anything.py | 3 +- .../models/deta/configuration_deta.py | 3 +- .../deta/convert_deta_resnet_to_pytorch.py | 1 - .../deta/convert_deta_swin_to_pytorch.py | 1 - src/transformers/models/deta/modeling_deta.py | 3 +- .../models/detr/configuration_detr.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/detr/convert_detr_to_pytorch.py | 1 - src/transformers/models/detr/modeling_detr.py | 3 +- .../models/dinat/configuration_dinat.py | 2 +- .../models/dinat/modeling_dinat.py | 3 +- .../models/dinov2/configuration_dinov2.py | 2 +- .../models/dinov2/convert_dinov2_to_hf.py | 1 - .../models/dinov2/modeling_dinov2.py | 3 +- .../distilbert/configuration_distilbert.py | 3 +- .../models/distilbert/modeling_distilbert.py | 5 +- .../distilbert/modeling_tf_distilbert.py | 3 +- .../dit/convert_dit_unilm_to_pytorch.py | 1 - .../models/donut/configuration_donut_swin.py | 2 +- .../models/donut/convert_donut_to_pytorch.py | 24 ++-- .../models/donut/modeling_donut_swin.py | 2 +- .../models/donut/processing_donut.py | 1 + .../models/dpr/configuration_dpr.py | 2 +- src/transformers/models/dpr/modeling_dpr.py | 3 +- .../models/dpr/modeling_tf_dpr.py | 2 +- .../models/dpr/tokenization_dpr.py | 1 - .../models/dpr/tokenization_dpr_fast.py | 1 - .../models/dpt/configuration_dpt.py | 2 +- .../models/dpt/convert_dinov2_depth_to_hf.py | 1 - .../models/dpt/convert_dpt_beit_to_hf.py | 1 - .../dpt/convert_dpt_hybrid_to_pytorch.py | 1 - .../models/dpt/convert_dpt_swinv2_to_hf.py | 1 - .../models/dpt/convert_dpt_to_pytorch.py | 1 - src/transformers/models/dpt/modeling_dpt.py | 3 +- .../configuration_efficientformer.py | 2 +- .../modeling_efficientformer.py | 2 +- .../modeling_tf_efficientformer.py | 2 +- .../configuration_efficientnet.py | 2 +- .../efficientnet/modeling_efficientnet.py | 3 +- .../models/electra/configuration_electra.py | 2 +- ...ectra_original_tf_checkpoint_to_pytorch.py | 1 - .../models/electra/modeling_tf_electra.py | 3 +- .../models/encodec/configuration_encodec.py | 3 +- .../models/encodec/modeling_encodec.py | 2 +- .../modeling_encoder_decoder.py | 3 
+- .../modeling_flax_encoder_decoder.py | 3 +- .../modeling_tf_encoder_decoder.py | 3 +- .../models/ernie/configuration_ernie.py | 3 +- .../models/ernie/modeling_ernie.py | 1 - .../models/ernie_m/configuration_ernie_m.py | 2 +- .../models/ernie_m/modeling_ernie_m.py | 3 +- .../models/esm/configuration_esm.py | 2 +- src/transformers/models/esm/convert_esm.py | 1 - src/transformers/models/esm/modeling_esm.py | 2 +- .../models/esm/modeling_tf_esm.py | 3 +- .../models/esm/openfold_utils/feats.py | 6 +- .../models/esm/openfold_utils/protein.py | 1 + .../models/esm/openfold_utils/tensor_utils.py | 12 +- .../models/esm/tokenization_esm.py | 1 + .../configuration_fastspeech2_conformer.py | 2 +- .../modeling_fastspeech2_conformer.py | 2 +- .../tokenization_fastspeech2_conformer.py | 1 + .../models/flaubert/configuration_flaubert.py | 3 +- .../models/flaubert/modeling_flaubert.py | 2 +- .../models/flaubert/modeling_tf_flaubert.py | 3 +- .../models/flaubert/tokenization_flaubert.py | 1 - .../models/flava/configuration_flava.py | 2 +- .../models/flava/modeling_flava.py | 2 +- .../models/fnet/configuration_fnet.py | 2 +- ...net_original_flax_checkpoint_to_pytorch.py | 1 - src/transformers/models/fnet/modeling_fnet.py | 2 +- .../models/fnet/tokenization_fnet.py | 2 +- .../models/fnet/tokenization_fnet_fast.py | 3 +- .../models/focalnet/configuration_focalnet.py | 2 +- .../models/focalnet/modeling_focalnet.py | 3 +- .../models/fsmt/configuration_fsmt.py | 3 +- .../models/fsmt/tokenization_fsmt.py | 1 - .../models/funnel/configuration_funnel.py | 2 +- ...unnel_original_tf_checkpoint_to_pytorch.py | 1 - .../models/funnel/modeling_funnel.py | 2 +- .../models/funnel/modeling_tf_funnel.py | 3 +- .../models/funnel/tokenization_funnel.py | 2 +- .../models/funnel/tokenization_funnel_fast.py | 2 +- .../models/fuyu/configuration_fuyu.py | 2 +- src/transformers/models/fuyu/modeling_fuyu.py | 3 +- .../models/fuyu/processing_fuyu.py | 1 + .../models/gemma/configuration_gemma.py | 2 +- .../models/gemma/modeling_flax_gemma.py | 1 + .../models/gemma/modeling_gemma.py | 2 +- .../models/gemma/tokenization_gemma.py | 1 + .../models/git/convert_git_to_pytorch.py | 1 - src/transformers/models/git/modeling_git.py | 1 - .../models/glpn/configuration_glpn.py | 2 +- .../models/glpn/convert_glpn_to_pytorch.py | 1 - src/transformers/models/glpn/modeling_glpn.py | 3 +- .../models/gpt2/configuration_gpt2.py | 3 +- ..._gpt2_original_tf_checkpoint_to_pytorch.py | 1 - .../models/gpt2/modeling_tf_gpt2.py | 2 +- .../models/gpt2/tokenization_gpt2.py | 1 - .../models/gpt2/tokenization_gpt2_fast.py | 1 - .../gpt_bigcode/configuration_gpt_bigcode.py | 2 +- .../gpt_bigcode/modeling_gpt_bigcode.py | 1 + .../models/gpt_neo/configuration_gpt_neo.py | 2 +- .../convert_gpt_neo_mesh_tf_to_pytorch.py | 1 - .../models/gpt_neo/modeling_gpt_neo.py | 3 +- .../models/gpt_neox/configuration_gpt_neox.py | 2 +- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../gpt_neox/tokenization_gpt_neox_fast.py | 1 + .../configuration_gpt_neox_japanese.py | 2 +- .../modeling_gpt_neox_japanese.py | 2 +- .../tokenization_gpt_neox_japanese.py | 1 + .../gpt_sw3/convert_megatron_to_pytorch.py | 2 +- .../models/gptj/configuration_gptj.py | 3 +- src/transformers/models/gptj/modeling_gptj.py | 2 +- .../models/gptj/modeling_tf_gptj.py | 2 +- .../configuration_gptsan_japanese.py | 3 +- .../modeling_gptsan_japanese.py | 3 +- .../tokenization_gptsan_japanese.py | 1 + .../graphormer/configuration_graphormer.py | 2 +- .../models/graphormer/modeling_graphormer.py | 2 +- 
.../configuration_grounding_dino.py | 2 +- .../grounding_dino/modeling_grounding_dino.py | 2 +- .../models/groupvit/configuration_groupvit.py | 2 +- .../models/groupvit/modeling_groupvit.py | 3 +- .../models/groupvit/modeling_tf_groupvit.py | 3 +- .../models/hubert/configuration_hubert.py | 2 +- ...rt_original_s3prl_checkpoint_to_pytorch.py | 1 - ..._original_pytorch_checkpoint_to_pytorch.py | 1 - ...rt_original_s3prl_checkpoint_to_pytorch.py | 1 - .../models/hubert/modeling_hubert.py | 2 +- .../models/hubert/modeling_tf_hubert.py | 2 +- .../models/ibert/configuration_ibert.py | 3 +- .../models/idefics/configuration_idefics.py | 2 +- .../models/idefics/modeling_idefics.py | 3 +- .../models/idefics/modeling_tf_idefics.py | 2 +- src/transformers/models/idefics/perceiver.py | 1 + .../models/idefics/perceiver_tf.py | 1 + src/transformers/models/idefics/vision.py | 3 +- src/transformers/models/idefics/vision_tf.py | 3 +- .../models/imagegpt/configuration_imagegpt.py | 2 +- ...onvert_imagegpt_original_tf2_to_pytorch.py | 1 - .../models/informer/modeling_informer.py | 2 +- .../configuration_instructblip.py | 2 +- .../instructblip/modeling_instructblip.py | 2 +- .../models/jamba/configuration_jamba.py | 3 +- .../models/jamba/modeling_jamba.py | 3 +- .../models/jukebox/configuration_jukebox.py | 2 +- .../models/jukebox/tokenization_jukebox.py | 1 - .../models/kosmos2/configuration_kosmos2.py | 2 +- .../models/kosmos2/modeling_kosmos2.py | 3 +- .../models/layoutlm/configuration_layoutlm.py | 3 +- .../models/layoutlm/modeling_layoutlm.py | 3 +- .../models/layoutlm/modeling_tf_layoutlm.py | 3 +- .../models/layoutlm/tokenization_layoutlm.py | 2 +- .../layoutlm/tokenization_layoutlm_fast.py | 2 +- .../layoutlmv2/configuration_layoutlmv2.py | 2 +- .../models/layoutlmv2/modeling_layoutlmv2.py | 2 +- .../layoutlmv3/configuration_layoutlmv3.py | 2 +- .../layoutlmv3/modeling_tf_layoutlmv3.py | 1 - .../models/layoutxlm/processing_layoutxlm.py | 1 + .../layoutxlm/tokenization_layoutxlm.py | 3 +- .../layoutxlm/tokenization_layoutxlm_fast.py | 3 +- .../models/led/configuration_led.py | 2 +- src/transformers/models/led/modeling_led.py | 3 +- .../models/led/modeling_tf_led.py | 3 +- .../models/levit/configuration_levit.py | 2 +- .../levit/convert_levit_timm_to_pytorch.py | 1 - .../models/levit/modeling_levit.py | 2 +- .../models/lilt/configuration_lilt.py | 2 +- .../models/llama/configuration_llama.py | 2 +- .../models/llama/modeling_flax_llama.py | 1 + .../models/llama/tokenization_llama.py | 1 + .../models/llava/configuration_llava.py | 2 +- .../models/llava/processing_llava.py | 1 - .../llava_next/configuration_llava_next.py | 2 +- .../longformer/configuration_longformer.py | 3 +- ...r_original_pytorch_lightning_to_pytorch.py | 1 - .../longformer/modeling_tf_longformer.py | 1 - .../tokenization_longformer_fast.py | 1 + .../models/longt5/configuration_longt5.py | 3 +- .../convert_longt5x_checkpoint_to_flax.py | 6 +- .../models/longt5/modeling_flax_longt5.py | 3 +- .../models/longt5/modeling_longt5.py | 3 +- .../models/luke/configuration_luke.py | 2 +- .../models/lxmert/configuration_lxmert.py | 3 +- ...xmert_original_tf_checkpoint_to_pytorch.py | 1 - .../models/lxmert/modeling_lxmert.py | 3 +- .../models/lxmert/modeling_tf_lxmert.py | 3 +- .../models/m2m_100/configuration_m2m_100.py | 3 +- .../models/m2m_100/tokenization_m2m_100.py | 1 + .../models/marian/configuration_marian.py | 3 +- .../models/marian/modeling_flax_marian.py | 2 +- .../models/marian/modeling_marian.py | 1 - 
.../models/marian/modeling_tf_marian.py | 3 +- .../models/markuplm/configuration_markuplm.py | 2 +- .../models/markuplm/modeling_markuplm.py | 2 +- .../models/markuplm/processing_markuplm.py | 1 + .../mask2former/configuration_mask2former.py | 3 +- .../mask2former/modeling_mask2former.py | 2 +- .../maskformer/configuration_maskformer.py | 3 +- .../configuration_maskformer_swin.py | 2 +- .../convert_maskformer_resnet_to_pytorch.py | 1 - .../convert_maskformer_swin_to_pytorch.py | 1 - .../models/maskformer/modeling_maskformer.py | 2 +- .../models/mbart/configuration_mbart.py | 3 +- .../models/mbart/modeling_flax_mbart.py | 2 +- .../models/mbart/modeling_mbart.py | 3 +- .../models/mbart/modeling_tf_mbart.py | 3 +- .../models/mega/configuration_mega.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 + .../configuration_megatron_bert.py | 2 +- .../megatron_bert/modeling_megatron_bert.py | 3 +- .../models/mgp_str/configuration_mgp_str.py | 2 +- .../models/mgp_str/modeling_mgp_str.py | 2 +- .../models/mistral/configuration_mistral.py | 2 +- .../models/mistral/modeling_flax_mistral.py | 3 +- .../models/mixtral/configuration_mixtral.py | 2 +- .../models/mixtral/modeling_mixtral.py | 3 +- .../models/mluke/tokenization_mluke.py | 3 +- .../mobilebert/configuration_mobilebert.py | 3 +- .../mobilebert/modeling_tf_mobilebert.py | 3 +- .../mobilebert/tokenization_mobilebert.py | 1 - .../configuration_mobilenet_v1.py | 2 +- ...nvert_original_tf_checkpoint_to_pytorch.py | 1 - .../mobilenet_v1/modeling_mobilenet_v1.py | 3 +- .../configuration_mobilenet_v2.py | 2 +- ...nvert_original_tf_checkpoint_to_pytorch.py | 1 - .../mobilenet_v2/modeling_mobilenet_v2.py | 27 ++-- .../mobilevit/configuration_mobilevit.py | 2 +- .../mobilevit/convert_mlcvnets_to_pytorch.py | 1 - .../models/mobilevit/modeling_mobilevit.py | 3 +- .../models/mobilevit/modeling_tf_mobilevit.py | 2 +- .../mobilevitv2/configuration_mobilevitv2.py | 2 +- .../convert_mlcvnets_to_pytorch.py | 1 - .../mobilevitv2/modeling_mobilevitv2.py | 3 +- .../models/mpnet/configuration_mpnet.py | 2 +- .../models/mpnet/modeling_mpnet.py | 1 - .../models/mpnet/modeling_tf_mpnet.py | 3 +- .../models/mpt/configuration_mpt.py | 3 +- .../models/mra/configuration_mra.py | 2 +- src/transformers/models/mra/modeling_mra.py | 3 +- .../models/mt5/configuration_mt5.py | 3 +- .../models/mt5/modeling_flax_mt5.py | 2 +- src/transformers/models/mt5/modeling_mt5.py | 2 +- .../models/mt5/modeling_tf_mt5.py | 2 +- .../models/musicgen/configuration_musicgen.py | 2 +- .../musicgen/convert_musicgen_transformers.py | 1 + .../models/musicgen/modeling_musicgen.py | 3 +- .../models/musicgen/processing_musicgen.py | 1 + .../configuration_musicgen_melody.py | 2 +- .../convert_musicgen_melody_transformers.py | 1 + .../feature_extraction_musicgen_melody.py | 1 + .../modeling_musicgen_melody.py | 3 +- .../processing_musicgen_melody.py | 1 + .../models/mvp/configuration_mvp.py | 3 +- src/transformers/models/mvp/modeling_mvp.py | 3 +- .../models/nat/configuration_nat.py | 2 +- src/transformers/models/nat/modeling_nat.py | 3 +- .../models/nezha/modeling_nezha.py | 1 - .../models/nllb_moe/configuration_nllb_moe.py | 3 +- .../models/nllb_moe/modeling_nllb_moe.py | 3 +- .../models/nougat/convert_nougat_to_hf.py | 24 ++-- .../models/nougat/tokenization_nougat_fast.py | 1 + .../configuration_nystromformer.py | 2 +- .../nystromformer/modeling_nystromformer.py | 3 +- .../models/olmo/configuration_olmo.py | 2 +- .../oneformer/configuration_oneformer.py | 1 + 
.../models/oneformer/modeling_oneformer.py | 3 +- .../models/openai/configuration_openai.py | 2 +- ...penai_original_tf_checkpoint_to_pytorch.py | 1 - .../models/openai/modeling_openai.py | 1 - .../models/openai/modeling_tf_openai.py | 2 +- .../models/openai/tokenization_openai.py | 1 - .../models/openai/tokenization_openai_fast.py | 1 - .../models/opt/configuration_opt.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/opt/modeling_flax_opt.py | 2 +- src/transformers/models/opt/modeling_opt.py | 3 +- .../models/opt/modeling_tf_opt.py | 3 +- .../models/owlv2/configuration_owlv2.py | 2 +- .../models/owlv2/modeling_owlv2.py | 2 +- .../models/owlvit/configuration_owlvit.py | 2 +- .../models/owlvit/modeling_owlvit.py | 2 +- .../paligemma/configuration_paligemma.py | 2 +- .../convert_paligemma_weights_to_hf.py | 4 +- .../models/paligemma/modeling_paligemma.py | 3 +- .../models/paligemma/processing_paligemma.py | 1 - .../configuration_patchtsmixer.py | 2 +- .../patchtsmixer/modeling_patchtsmixer.py | 2 +- .../models/patchtst/modeling_patchtst.py | 2 +- .../models/pegasus/configuration_pegasus.py | 2 +- .../models/pegasus/modeling_flax_pegasus.py | 3 +- .../models/pegasus/modeling_pegasus.py | 2 +- .../models/pegasus/modeling_tf_pegasus.py | 3 +- .../pegasus/tokenization_pegasus_fast.py | 3 +- .../pegasus_x/configuration_pegasus_x.py | 2 +- .../models/pegasus_x/modeling_pegasus_x.py | 2 +- .../perceiver/configuration_perceiver.py | 2 +- .../convert_perceiver_haiku_to_pytorch.py | 1 - .../models/perceiver/modeling_perceiver.py | 2 +- .../perceiver/tokenization_perceiver.py | 3 +- .../persimmon/configuration_persimmon.py | 2 +- .../models/persimmon/modeling_persimmon.py | 3 +- .../models/phi/configuration_phi.py | 3 +- src/transformers/models/phi/modeling_phi.py | 3 +- .../models/phi3/configuration_phi3.py | 3 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/phobert/tokenization_phobert.py | 3 +- .../pix2struct/configuration_pix2struct.py | 2 +- .../pix2struct/image_processing_pix2struct.py | 1 + .../models/pix2struct/modeling_pix2struct.py | 2 +- .../models/plbart/configuration_plbart.py | 3 +- .../models/plbart/modeling_plbart.py | 3 +- .../poolformer/configuration_poolformer.py | 3 +- .../models/poolformer/modeling_poolformer.py | 3 +- .../pop2piano/configuration_pop2piano.py | 3 +- .../convert_pop2piano_weights_to_hf.py | 4 +- .../pop2piano/feature_extraction_pop2piano.py | 2 +- .../models/pop2piano/modeling_pop2piano.py | 3 +- .../models/pop2piano/processing_pop2piano.py | 2 +- .../prophetnet/configuration_prophetnet.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/prophetnet/modeling_prophetnet.py | 2 +- .../models/pvt/configuration_pvt.py | 2 +- .../models/pvt/convert_pvt_to_pytorch.py | 1 - src/transformers/models/pvt/modeling_pvt.py | 2 +- .../models/qdqbert/configuration_qdqbert.py | 2 +- .../models/qdqbert/modeling_qdqbert.py | 3 +- .../models/qwen2/configuration_qwen2.py | 2 +- .../models/qwen2/modeling_qwen2.py | 3 +- .../qwen2_moe/configuration_qwen2_moe.py | 2 +- .../models/rag/configuration_rag.py | 3 +- .../models/rag/modeling_tf_rag.py | 1 - .../models/rag/tokenization_rag.py | 1 + .../models/realm/configuration_realm.py | 2 +- .../models/realm/modeling_realm.py | 2 +- .../configuration_recurrent_gemma.py | 2 +- .../modeling_recurrent_gemma.py | 2 +- .../models/reformer/configuration_reformer.py | 2 +- ...ert_reformer_trax_checkpoint_to_pytorch.py | 1 - .../models/reformer/tokenization_reformer.py | 3 +- 
.../reformer/tokenization_reformer_fast.py | 3 +- .../models/regnet/configuration_regnet.py | 2 +- .../regnet/convert_regnet_to_pytorch.py | 1 - .../models/regnet/modeling_regnet.py | 2 +- .../models/regnet/modeling_tf_regnet.py | 2 +- .../models/rembert/configuration_rembert.py | 3 +- ...onvert_rembert_tf_checkpoint_to_pytorch.py | 1 - .../models/rembert/modeling_rembert.py | 3 +- .../models/rembert/modeling_tf_rembert.py | 3 +- .../models/rembert/tokenization_rembert.py | 1 - .../rembert/tokenization_rembert_fast.py | 3 +- .../models/resnet/configuration_resnet.py | 2 +- .../resnet/convert_resnet_to_pytorch.py | 1 - .../models/resnet/modeling_resnet.py | 2 +- .../models/resnet/modeling_tf_resnet.py | 2 +- .../models/roberta/configuration_roberta.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/roberta/modeling_tf_roberta.py | 3 +- .../roberta/tokenization_roberta_fast.py | 1 + .../configuration_roberta_prelayernorm.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../modeling_flax_roberta_prelayernorm.py | 3 +- .../modeling_tf_roberta_prelayernorm.py | 3 +- .../models/roc_bert/configuration_roc_bert.py | 2 +- .../models/roc_bert/modeling_roc_bert.py | 2 +- .../models/roformer/configuration_roformer.py | 2 +- ...ormer_original_tf_checkpoint_to_pytorch.py | 1 - .../models/roformer/modeling_flax_roformer.py | 2 +- .../models/roformer/modeling_roformer.py | 3 +- .../models/roformer/modeling_tf_roformer.py | 3 +- .../roformer/tokenization_roformer_fast.py | 1 + .../models/rwkv/configuration_rwkv.py | 2 +- .../rwkv/convert_rwkv_checkpoint_to_hf.py | 1 - .../models/sam/configuration_sam.py | 3 +- .../models/sam/convert_sam_to_hf.py | 1 + .../models/sam/image_processing_sam.py | 1 + src/transformers/models/sam/modeling_sam.py | 2 +- .../models/sam/modeling_tf_sam.py | 1 - src/transformers/models/sam/processing_sam.py | 1 + .../configuration_seamless_m4t.py | 2 +- .../seamless_m4t/convert_fairseq2_to_hf.py | 3 +- .../seamless_m4t/modeling_seamless_m4t.py | 3 +- .../seamless_m4t/tokenization_seamless_m4t.py | 1 + .../tokenization_seamless_m4t_fast.py | 1 + .../configuration_seamless_m4t_v2.py | 2 +- .../seamless_m4t_v2/convert_fairseq2_to_hf.py | 3 +- .../modeling_seamless_m4t_v2.py | 3 +- .../segformer/configuration_segformer.py | 2 +- .../convert_segformer_original_to_pytorch.py | 1 - .../models/segformer/modeling_segformer.py | 3 +- .../models/segformer/modeling_tf_segformer.py | 3 +- .../models/seggpt/configuration_seggpt.py | 3 +- .../models/seggpt/convert_seggpt_to_hf.py | 1 - .../models/seggpt/modeling_seggpt.py | 3 +- .../models/sew/configuration_sew.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - src/transformers/models/sew/modeling_sew.py | 2 +- .../models/sew_d/configuration_sew_d.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/sew_d/modeling_sew_d.py | 2 +- .../models/siglip/configuration_siglip.py | 2 +- .../models/siglip/convert_siglip_to_hf.py | 1 - .../models/siglip/modeling_siglip.py | 3 +- .../models/siglip/tokenization_siglip.py | 2 +- ...rt_wav2vec2_seq2seq_original_to_pytorch.py | 1 - ...xt_wav2vec2_seq2seq_original_to_pytorch.py | 1 - .../modeling_flax_speech_encoder_decoder.py | 2 +- .../modeling_speech_encoder_decoder.py | 3 +- .../configuration_speech_to_text.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 2 +- .../modeling_tf_speech_to_text.py | 3 +- .../processing_speech_to_text.py | 1 + .../tokenization_speech_to_text.py | 1 + .../configuration_speech_to_text_2.py | 2 +- 
.../modeling_speech_to_text_2.py | 3 +- .../processing_speech_to_text_2.py | 1 + .../models/speecht5/configuration_speecht5.py | 2 +- .../models/speecht5/modeling_speecht5.py | 2 +- .../models/speecht5/tokenization_speecht5.py | 1 - .../models/splinter/configuration_splinter.py | 2 +- .../models/splinter/modeling_splinter.py | 3 +- .../squeezebert/configuration_squeezebert.py | 3 +- .../squeezebert/modeling_squeezebert.py | 3 +- .../models/stablelm/configuration_stablelm.py | 2 +- .../models/stablelm/modeling_stablelm.py | 3 +- .../starcoder2/configuration_starcoder2.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 3 +- .../models/superpoint/modeling_superpoint.py | 1 + .../swiftformer/configuration_swiftformer.py | 2 +- .../convert_swiftformer_original_to_hf.py | 1 - .../swiftformer/modeling_swiftformer.py | 3 +- .../swiftformer/modeling_tf_swiftformer.py | 3 +- .../models/swin/configuration_swin.py | 2 +- .../swin/convert_swin_simmim_to_pytorch.py | 12 +- .../swin/convert_swin_timm_to_pytorch.py | 12 +- src/transformers/models/swin/modeling_swin.py | 3 +- .../models/swin/modeling_tf_swin.py | 3 +- .../models/swin2sr/configuration_swin2sr.py | 2 +- .../convert_swin2sr_original_to_pytorch.py | 24 ++-- .../models/swin2sr/modeling_swin2sr.py | 3 +- .../models/swinv2/configuration_swinv2.py | 2 +- .../swinv2/convert_swinv2_timm_to_pytorch.py | 18 +-- .../models/swinv2/modeling_swinv2.py | 3 +- .../configuration_switch_transformers.py | 3 +- .../modeling_switch_transformers.py | 3 +- .../models/t5/configuration_t5.py | 3 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 1 - .../t5/convert_t5x_checkpoint_to_flax.py | 126 ++++++++-------- .../models/t5/modeling_flax_t5.py | 3 +- src/transformers/models/t5/modeling_t5.py | 3 +- src/transformers/models/t5/modeling_tf_t5.py | 3 +- src/transformers/models/t5/tokenization_t5.py | 3 +- .../models/t5/tokenization_t5_fast.py | 3 +- .../configuration_table_transformer.py | 3 +- .../convert_table_transformer_to_hf.py | 1 - ...convert_table_transformer_to_hf_no_timm.py | 1 - .../modeling_table_transformer.py | 3 +- .../models/tapas/configuration_tapas.py | 1 - ...tapas_original_tf_checkpoint_to_pytorch.py | 1 - .../models/tapas/modeling_tapas.py | 1 - .../models/tapas/modeling_tf_tapas.py | 1 - .../models/tapas/tokenization_tapas.py | 3 +- .../configuration_time_series_transformer.py | 2 +- .../modeling_time_series_transformer.py | 2 +- .../timesformer/configuration_timesformer.py | 2 +- .../timesformer/modeling_timesformer.py | 3 +- .../configuration_timm_backbone.py | 2 +- .../models/trocr/configuration_trocr.py | 2 +- .../trocr/convert_trocr_unilm_to_pytorch.py | 1 - .../models/trocr/modeling_trocr.py | 3 +- .../models/trocr/processing_trocr.py | 1 + .../models/tvlt/configuration_tvlt.py | 2 +- .../models/tvlt/image_processing_tvlt.py | 1 + src/transformers/models/tvlt/modeling_tvlt.py | 3 +- .../models/tvp/configuration_tvp.py | 2 +- src/transformers/models/tvp/processing_tvp.py | 1 - .../models/udop/configuration_udop.py | 3 +- .../models/udop/convert_udop_to_hf.py | 1 - src/transformers/models/udop/modeling_udop.py | 2 +- .../models/udop/tokenization_udop.py | 3 +- .../models/udop/tokenization_udop_fast.py | 3 +- .../models/umt5/configuration_umt5.py | 3 +- .../convert_umt5_checkpoint_to_pytorch.py | 6 +- src/transformers/models/umt5/modeling_umt5.py | 2 +- .../unispeech/configuration_unispeech.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/unispeech/modeling_unispeech.py | 2 +- .../configuration_unispeech_sat.py | 
2 +- ...ch_original_s3prl_checkpoint_to_pytorch.py | 1 - ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../unispeech_sat/modeling_unispeech_sat.py | 2 +- .../models/univnet/configuration_univnet.py | 2 +- .../models/univnet/modeling_univnet.py | 2 +- .../models/upernet/configuration_upernet.py | 3 +- .../models/upernet/modeling_upernet.py | 2 +- .../video_llava/configuration_video_llava.py | 2 +- .../video_llava/modeling_video_llava.py | 3 +- .../video_llava/processing_video_llava.py | 1 - .../models/videomae/configuration_videomae.py | 2 +- .../models/videomae/modeling_videomae.py | 3 +- .../models/vilt/configuration_vilt.py | 2 +- .../vilt/convert_vilt_original_to_pytorch.py | 1 - src/transformers/models/vilt/modeling_vilt.py | 2 +- .../models/vipllava/configuration_vipllava.py | 2 +- .../models/vipllava/modeling_vipllava.py | 3 +- .../modeling_flax_vision_encoder_decoder.py | 3 +- .../modeling_tf_vision_encoder_decoder.py | 3 +- .../modeling_vision_encoder_decoder.py | 3 +- .../configuration_vision_text_dual_encoder.py | 3 +- .../modeling_flax_vision_text_dual_encoder.py | 3 +- .../modeling_tf_vision_text_dual_encoder.py | 1 - .../modeling_vision_text_dual_encoder.py | 3 +- .../visual_bert/configuration_visual_bert.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../visual_bert/modeling_visual_bert.py | 3 +- .../models/vit/configuration_vit.py | 2 +- .../models/vit/convert_dino_to_pytorch.py | 1 - .../models/vit/convert_vit_timm_to_pytorch.py | 1 - .../models/vit/modeling_tf_vit.py | 3 +- src/transformers/models/vit/modeling_vit.py | 3 +- .../vit_hybrid/configuration_vit_hybrid.py | 3 +- .../convert_vit_hybrid_timm_to_pytorch.py | 1 - .../models/vit_hybrid/modeling_vit_hybrid.py | 3 +- .../models/vit_mae/configuration_vit_mae.py | 2 +- .../models/vit_mae/modeling_tf_vit_mae.py | 3 +- .../models/vit_mae/modeling_vit_mae.py | 3 +- .../models/vit_msn/configuration_vit_msn.py | 3 +- .../models/vit_msn/modeling_vit_msn.py | 3 +- .../models/vitdet/configuration_vitdet.py | 3 +- .../models/vitdet/modeling_vitdet.py | 3 +- .../models/vitmatte/configuration_vitmatte.py | 2 +- .../models/vitmatte/modeling_vitmatte.py | 2 +- .../models/vits/configuration_vits.py | 3 +- src/transformers/models/vits/modeling_vits.py | 2 +- .../models/vits/tokenization_vits.py | 1 - .../models/vivit/configuration_vivit.py | 2 +- .../vivit/convert_vivit_flax_to_pytorch.py | 1 + .../models/vivit/image_processing_vivit.py | 1 + .../models/vivit/modeling_vivit.py | 3 +- .../models/wav2vec2/configuration_wav2vec2.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - ...c2_original_s3prl_checkpoint_to_pytorch.py | 1 - .../models/wav2vec2/modeling_flax_wav2vec2.py | 2 +- .../models/wav2vec2/modeling_tf_wav2vec2.py | 3 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../models/wav2vec2/processing_wav2vec2.py | 1 + .../configuration_wav2vec2_bert.py | 3 +- .../convert_wav2vec2_seamless_checkpoint.py | 1 - .../wav2vec2_bert/modeling_wav2vec2_bert.py | 2 +- .../wav2vec2_bert/processing_wav2vec2_bert.py | 1 + .../configuration_wav2vec2_conformer.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../modeling_wav2vec2_conformer.py | 2 +- .../tokenization_wav2vec2_phoneme.py | 1 - .../processing_wav2vec2_with_lm.py | 1 + .../models/wavlm/configuration_wavlm.py | 2 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - ...lm_original_s3prl_checkpoint_to_pytorch.py | 1 - .../models/wavlm/modeling_wavlm.py | 2 +- .../models/whisper/configuration_whisper.py | 2 +- 
.../whisper/feature_extraction_whisper.py | 1 + .../models/whisper/modeling_flax_whisper.py | 2 +- .../models/whisper/modeling_tf_whisper.py | 3 +- .../models/whisper/modeling_whisper.py | 3 +- .../models/whisper/processing_whisper.py | 1 - .../models/whisper/tokenization_whisper.py | 1 + .../whisper/tokenization_whisper_fast.py | 1 + .../models/x_clip/configuration_x_clip.py | 2 +- .../models/x_clip/modeling_x_clip.py | 3 +- .../models/xglm/configuration_xglm.py | 2 +- .../models/xglm/modeling_flax_xglm.py | 3 +- .../models/xglm/modeling_tf_xglm.py | 3 +- src/transformers/models/xglm/modeling_xglm.py | 3 +- .../models/xglm/tokenization_xglm.py | 1 + .../models/xlm/configuration_xlm.py | 3 +- ..._original_pytorch_checkpoint_to_pytorch.py | 1 - .../models/xlm/modeling_tf_xlm.py | 3 +- src/transformers/models/xlm/modeling_xlm.py | 2 +- .../models/xlm/tokenization_xlm.py | 5 +- .../configuration_xlm_prophetnet.py | 3 +- .../xlm_prophetnet/modeling_xlm_prophetnet.py | 3 +- .../xlm_roberta/configuration_xlm_roberta.py | 3 +- .../xlm_roberta/modeling_tf_xlm_roberta.py | 3 +- .../xlm_roberta/tokenization_xlm_roberta.py | 3 +- .../tokenization_xlm_roberta_fast.py | 3 +- .../configuration_xlm_roberta_xl.py | 2 +- .../models/xlnet/configuration_xlnet.py | 2 +- ...xlnet_original_tf_checkpoint_to_pytorch.py | 1 - .../models/xlnet/modeling_tf_xlnet.py | 3 +- .../models/xlnet/modeling_xlnet.py | 3 +- .../models/xlnet/tokenization_xlnet.py | 3 +- .../models/xlnet/tokenization_xlnet_fast.py | 3 +- .../models/xmod/configuration_xmod.py | 3 +- .../models/yolos/configuration_yolos.py | 2 +- .../models/yolos/convert_yolos_to_pytorch.py | 1 - .../models/yolos/modeling_yolos.py | 3 +- .../models/yoso/configuration_yoso.py | 2 +- src/transformers/models/yoso/modeling_yoso.py | 3 +- src/transformers/optimization_tf.py | 1 - src/transformers/processing_utils.py | 2 +- src/transformers/quantizers/base.py | 12 +- src/transformers/time_series_utils.py | 1 + src/transformers/tokenization_utils.py | 11 +- src/transformers/tokenization_utils_fast.py | 5 +- src/transformers/trainer_callback.py | 1 + src/transformers/utils/backbone_utils.py | 2 +- src/transformers/utils/fx.py | 6 +- src/transformers/utils/hub.py | 1 + src/transformers/utils/logging.py | 3 +- .../utils/sentencepiece_model_pb2_new.py | 1 + tests/models/align/test_modeling_align.py | 3 +- tests/models/altclip/test_modeling_altclip.py | 3 +- ..._modeling_audio_spectrogram_transformer.py | 2 +- .../autoformer/test_modeling_autoformer.py | 2 +- tests/models/bark/test_modeling_bark.py | 3 +- tests/models/bart/test_modeling_bart.py | 3 +- tests/models/beit/test_modeling_beit.py | 3 +- tests/models/bert/test_tokenization_bert.py | 20 +-- .../models/bert/test_tokenization_bert_tf.py | 2 +- .../models/big_bird/test_modeling_big_bird.py | 3 +- .../test_modeling_bigbird_pegasus.py | 3 +- tests/models/biogpt/test_modeling_biogpt.py | 2 +- tests/models/bit/test_modeling_bit.py | 3 +- .../blenderbot/test_modeling_blenderbot.py | 2 +- .../test_tokenization_blenderbot.py | 1 + .../test_modeling_blenderbot_small.py | 2 +- .../test_tokenization_blenderbot_small.py | 1 + tests/models/blip/test_modeling_blip.py | 3 +- tests/models/blip/test_modeling_blip_text.py | 3 +- tests/models/blip/test_modeling_tf_blip.py | 3 +- .../models/blip/test_modeling_tf_blip_text.py | 3 +- tests/models/blip_2/test_modeling_blip_2.py | 3 +- .../bridgetower/test_modeling_bridgetower.py | 2 +- tests/models/bros/test_modeling_bros.py | 2 +- tests/models/byt5/test_tokenization_byt5.py | 4 +- 
tests/models/canine/test_modeling_canine.py | 3 +- .../models/canine/test_tokenization_canine.py | 4 +- .../test_modeling_chinese_clip.py | 2 +- tests/models/clap/test_modeling_clap.py | 3 +- tests/models/clip/test_modeling_clip.py | 3 +- tests/models/clip/test_modeling_tf_clip.py | 3 +- tests/models/clip/test_tokenization_clip.py | 14 +- tests/models/clipseg/test_modeling_clipseg.py | 3 +- tests/models/clvp/test_modeling_clvp.py | 3 +- tests/models/cohere/test_modeling_cohere.py | 2 +- .../test_modeling_conditional_detr.py | 3 +- .../models/convbert/test_modeling_convbert.py | 3 +- .../models/convnext/test_modeling_convnext.py | 3 +- .../convnext/test_modeling_tf_convnext.py | 2 +- .../convnextv2/test_modeling_convnextv2.py | 3 +- .../convnextv2/test_modeling_tf_convnextv2.py | 2 +- tests/models/cpmant/test_modeling_cpmant.py | 2 +- tests/models/cvt/test_modeling_cvt.py | 3 +- tests/models/cvt/test_modeling_tf_cvt.py | 3 +- .../data2vec/test_modeling_data2vec_audio.py | 2 +- .../data2vec/test_modeling_data2vec_text.py | 2 +- .../data2vec/test_modeling_data2vec_vision.py | 3 +- .../test_modeling_tf_data2vec_vision.py | 2 +- tests/models/dbrx/test_modeling_dbrx.py | 3 +- .../test_modeling_decision_transformer.py | 3 +- .../test_modeling_deformable_detr.py | 3 +- tests/models/deit/test_modeling_deit.py | 3 +- tests/models/deit/test_modeling_tf_deit.py | 3 +- .../test_modeling_depth_anything.py | 3 +- tests/models/deta/test_modeling_deta.py | 3 +- tests/models/detr/test_modeling_detr.py | 3 +- tests/models/dinat/test_modeling_dinat.py | 2 +- tests/models/dinov2/test_modeling_dinov2.py | 3 +- .../models/donut/test_modeling_donut_swin.py | 2 +- tests/models/dpt/test_modeling_dpt.py | 3 +- .../dpt/test_modeling_dpt_auto_backbone.py | 3 +- tests/models/dpt/test_modeling_dpt_hybrid.py | 3 +- .../test_modeling_efficientformer.py | 3 +- .../test_modeling_tf_efficientformer.py | 2 +- .../test_modeling_efficientnet.py | 3 +- .../electra/test_tokenization_electra.py | 20 +-- tests/models/encodec/test_modeling_encodec.py | 2 +- tests/models/ernie_m/test_modeling_ernie_m.py | 3 +- .../ernie_m/test_tokenization_ernie_m.py | 2 +- tests/models/esm/test_modeling_esm.py | 3 +- tests/models/esm/test_modeling_esmfold.py | 3 +- tests/models/falcon/test_modeling_falcon.py | 3 +- .../test_modeling_fastspeech2_conformer.py | 2 +- tests/models/flava/test_modeling_flava.py | 3 +- tests/models/fnet/test_modeling_fnet.py | 3 +- .../models/focalnet/test_modeling_focalnet.py | 2 +- .../models/funnel/test_tokenization_funnel.py | 8 +- tests/models/fuyu/test_modeling_fuyu.py | 2 +- tests/models/gemma/test_modeling_gemma.py | 3 +- tests/models/glpn/test_modeling_glpn.py | 3 +- .../models/gpt2/test_tokenization_gpt2_tf.py | 2 +- tests/models/gpt_neo/test_modeling_gpt_neo.py | 3 +- .../models/gpt_neox/test_modeling_gpt_neox.py | 3 +- .../test_modeling_gpt_neox_japanese.py | 3 +- .../graphormer/test_modeling_graphormer.py | 3 +- .../test_modeling_grounding_dino.py | 2 +- .../models/groupvit/test_modeling_groupvit.py | 3 +- .../groupvit/test_modeling_tf_groupvit.py | 3 +- tests/models/hubert/test_modeling_hubert.py | 3 +- tests/models/idefics/test_modeling_idefics.py | 2 +- .../idefics/test_modeling_tf_idefics.py | 2 +- .../models/informer/test_modeling_informer.py | 2 +- .../test_modeling_instructblip.py | 3 +- tests/models/jamba/test_modeling_jamba.py | 3 +- tests/models/kosmos2/test_modeling_kosmos2.py | 3 +- .../layoutlm/test_tokenization_layoutlm.py | 4 +- .../layoutlmv2/test_modeling_layoutlmv2.py | 3 +- 
.../test_tokenization_layoutlmv2.py | 14 +- .../layoutlmv3/test_modeling_layoutlmv3.py | 2 +- .../layoutlmv3/test_modeling_tf_layoutlmv3.py | 2 +- .../layoutxlm/test_tokenization_layoutxlm.py | 2 +- tests/models/led/test_modeling_led.py | 3 +- tests/models/levit/test_modeling_levit.py | 3 +- .../test_tokenization_longformer.py | 3 +- tests/models/luke/test_modeling_luke.py | 3 +- .../models/lxmert/test_tokenization_lxmert.py | 4 +- tests/models/m2m_100/test_modeling_m2m_100.py | 3 +- tests/models/marian/test_modeling_marian.py | 2 +- .../markuplm/test_tokenization_markuplm.py | 2 +- .../mask2former/test_modeling_mask2former.py | 2 +- .../maskformer/test_modeling_maskformer.py | 2 +- .../test_modeling_maskformer_swin.py | 2 +- tests/models/mbart/test_modeling_mbart.py | 3 +- .../test_modeling_megatron_bert.py | 3 +- tests/models/mgp_str/test_modeling_mgp_str.py | 2 +- .../models/mgp_str/test_processor_mgp_str.py | 2 +- tests/models/mixtral/test_modeling_mixtral.py | 3 +- .../test_tokenization_mobilebert.py | 23 ++- .../test_modeling_mobilenet_v1.py | 3 +- .../test_modeling_mobilenet_v2.py | 3 +- .../mobilevit/test_modeling_mobilevit.py | 3 +- .../mobilevit/test_modeling_tf_mobilevit.py | 3 +- .../mobilevitv2/test_modeling_mobilevitv2.py | 3 +- tests/models/mpnet/test_tokenization_mpnet.py | 4 +- tests/models/mra/test_modeling_mra.py | 3 +- .../models/musicgen/test_modeling_musicgen.py | 3 +- .../test_modeling_musicgen_melody.py | 3 +- tests/models/mvp/test_modeling_mvp.py | 3 +- tests/models/nat/test_modeling_nat.py | 2 +- .../models/nllb_moe/test_modeling_nllb_moe.py | 3 +- .../test_modeling_nystromformer.py | 3 +- tests/models/olmo/test_modeling_olmo.py | 2 +- .../oneformer/test_modeling_oneformer.py | 2 +- tests/models/opt/test_modeling_opt.py | 3 +- tests/models/owlv2/test_modeling_owlv2.py | 3 +- tests/models/owlvit/test_modeling_owlvit.py | 3 +- .../paligemma/test_modeling_paligemma.py | 2 +- .../test_modeling_patchtsmixer.py | 2 +- .../models/patchtst/test_modeling_patchtst.py | 2 +- tests/models/pegasus/test_modeling_pegasus.py | 2 +- .../pegasus_x/test_modeling_pegasus_x.py | 3 +- .../perceiver/test_modeling_perceiver.py | 2 +- .../perceiver/test_tokenization_perceiver.py | 4 +- .../persimmon/test_modeling_persimmon.py | 3 +- tests/models/phi/test_modeling_phi.py | 3 +- tests/models/phi3/test_modeling_phi3.py | 3 +- .../pix2struct/test_modeling_pix2struct.py | 2 +- tests/models/plbart/test_modeling_plbart.py | 3 +- .../poolformer/test_modeling_poolformer.py | 3 +- .../pop2piano/test_modeling_pop2piano.py | 2 +- .../test_tokenization_prophetnet.py | 16 +-- tests/models/pvt/test_modeling_pvt.py | 3 +- tests/models/qdqbert/test_modeling_qdqbert.py | 3 +- tests/models/qwen2/test_modeling_qwen2.py | 3 +- .../qwen2_moe/test_modeling_qwen2_moe.py | 3 +- tests/models/realm/test_modeling_realm.py | 2 +- tests/models/realm/test_tokenization_realm.py | 20 +-- .../test_modeling_recurrent_gemma.py | 3 +- tests/models/regnet/test_modeling_regnet.py | 3 +- .../models/regnet/test_modeling_tf_regnet.py | 2 +- tests/models/rembert/test_modeling_rembert.py | 3 +- .../rembert/test_tokenization_rembert.py | 3 +- tests/models/resnet/test_modeling_resnet.py | 3 +- .../models/resnet/test_modeling_tf_resnet.py | 3 +- .../models/roc_bert/test_modeling_roc_bert.py | 2 +- .../roc_bert/test_tokenization_roc_bert.py | 12 +- .../models/roformer/test_modeling_roformer.py | 3 +- tests/models/sam/test_modeling_sam.py | 3 +- tests/models/sam/test_modeling_tf_sam.py | 3 +- .../test_modeling_seamless_m4t.py | 3 +- 
.../test_modeling_seamless_m4t_v2.py | 3 +- .../segformer/test_modeling_segformer.py | 3 +- .../segformer/test_modeling_tf_segformer.py | 2 +- tests/models/seggpt/test_modeling_seggpt.py | 3 +- tests/models/sew/test_modeling_sew.py | 3 +- tests/models/sew_d/test_modeling_sew_d.py | 3 +- tests/models/siglip/test_modeling_siglip.py | 3 +- .../test_modeling_speech_to_text.py | 2 +- .../test_modeling_tf_speech_to_text.py | 2 +- .../test_modeling_speech_to_text_2.py | 2 +- .../models/speecht5/test_modeling_speecht5.py | 2 +- .../models/splinter/test_modeling_splinter.py | 2 +- .../models/stablelm/test_modeling_stablelm.py | 3 +- .../starcoder2/test_modeling_starcoder2.py | 3 +- .../swiftformer/test_modeling_swiftformer.py | 3 +- .../test_modeling_tf_swiftformer.py | 3 +- tests/models/swin/test_modeling_swin.py | 2 +- tests/models/swin/test_modeling_tf_swin.py | 3 +- tests/models/swin2sr/test_modeling_swin2sr.py | 3 +- tests/models/swinv2/test_modeling_swinv2.py | 3 +- .../test_modeling_table_transformer.py | 3 +- tests/models/tapas/test_tokenization_tapas.py | 20 +-- .../test_modeling_time_series_transformer.py | 2 +- .../timesformer/test_modeling_timesformer.py | 3 +- tests/models/trocr/test_modeling_trocr.py | 2 +- .../tvlt/test_feature_extraction_tvlt.py | 2 +- .../models/tvlt/test_image_processor_tvlt.py | 2 +- tests/models/tvlt/test_modeling_tvlt.py | 2 +- tests/models/tvp/test_modeling_tvp.py | 3 +- tests/models/udop/test_tokenization_udop.py | 2 +- .../unispeech/test_modeling_unispeech.py | 2 +- .../test_modeling_unispeech_sat.py | 2 +- tests/models/upernet/test_modeling_upernet.py | 3 +- .../video_llava/test_modeling_video_llava.py | 2 +- .../models/videomae/test_modeling_videomae.py | 3 +- tests/models/vilt/test_modeling_vilt.py | 2 +- .../models/vipllava/test_modeling_vipllava.py | 2 +- ...test_modeling_tf_vision_encoder_decoder.py | 3 +- ..._modeling_flax_vision_text_dual_encoder.py | 3 +- ...st_modeling_tf_vision_text_dual_encoder.py | 3 +- .../test_modeling_vision_text_dual_encoder.py | 3 +- .../visual_bert/test_modeling_visual_bert.py | 2 +- tests/models/vit/test_modeling_tf_vit.py | 3 +- tests/models/vit/test_modeling_vit.py | 3 +- .../vit_hybrid/test_modeling_vit_hybrid.py | 3 +- .../vit_mae/test_modeling_tf_vit_mae.py | 3 +- tests/models/vit_mae/test_modeling_vit_mae.py | 3 +- tests/models/vit_msn/test_modeling_vit_msn.py | 3 +- tests/models/vitdet/test_modeling_vitdet.py | 3 +- .../models/vitmatte/test_modeling_vitmatte.py | 3 +- tests/models/vits/test_modeling_vits.py | 2 +- tests/models/vits/test_tokenization_vits.py | 3 +- tests/models/vivit/test_modeling_vivit.py | 3 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 2 +- .../test_modeling_wav2vec2_bert.py | 3 +- .../test_modeling_wav2vec2_conformer.py | 3 +- .../test_tokenization_wav2vec2_phoneme.py | 1 + tests/models/wavlm/test_modeling_wavlm.py | 2 +- .../whisper/test_modeling_tf_whisper.py | 2 +- tests/models/whisper/test_modeling_whisper.py | 2 +- tests/models/x_clip/test_modeling_x_clip.py | 3 +- tests/models/xlnet/test_tokenization_xlnet.py | 2 +- tests/models/yolos/test_modeling_yolos.py | 3 +- tests/models/yoso/test_modeling_yoso.py | 3 +- .../pytorch/run_glue_model_parallelism.py | 2 +- tests/test_tokenization_common.py | 6 +- tests/tokenization/test_tokenization_utils.py | 1 + utils/add_pipeline_model_mapping_to_test.py | 1 - utils/check_copies.py | 4 +- utils/check_doc_toc.py | 1 - utils/check_docstrings.py | 1 + utils/check_doctest_list.py | 1 + utils/check_dummies.py | 1 + utils/check_repo.py | 1 + 
utils/check_support_list.py | 1 + utils/check_table.py | 1 + utils/custom_init_isort.py | 1 + utils/download_glue_data.py | 2 +- utils/models_to_deprecate.py | 1 + utils/release.py | 1 + utils/sort_auto_mappings.py | 1 + utils/update_metadata.py | 1 + utils/update_tiny_models.py | 1 - 1172 files changed, 1555 insertions(+), 1861 deletions(-) diff --git a/Makefile b/Makefile index 6c1541a113..f9b2a8c9a7 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ export PYTHONPATH = src check_dirs := examples tests src utils -exclude_folders := examples/research_projects +exclude_folders := "" modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index d793cc3a37..a13c62e0fd 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -20,6 +20,7 @@ text file or a dataset. Here is the full list of checkpoints on the hub that can be fine-tuned by this script: https://huggingface.co/models?filter=fill-mask """ + import json import logging import math diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index caa70fe514..c4b47711d9 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -19,6 +19,7 @@ Pretraining the library models for T5-like span-masked language modeling on a te Here is the full list of checkpoints on the hub that can be pretrained by this script: https://huggingface.co/models?filter=t5 """ + import json import logging import math diff --git a/examples/flax/question-answering/utils_qa.py b/examples/flax/question-answering/utils_qa.py index 23a46370d1..79497dbb81 100644 --- a/examples/flax/question-answering/utils_qa.py +++ b/examples/flax/question-answering/utils_qa.py @@ -15,6 +15,7 @@ """ Post-processing utilities for question answering. """ + import collections import json import logging diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index f1f6520201..d1234db015 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning a 🤗 Flax Transformers model for sequence classification on GLUE.""" +"""Finetuning a 🤗 Flax Transformers model for sequence classification on GLUE.""" + import json import logging import math diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 6ce06c8218..ecb52ceb08 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Fine-tuning a 🤗 Flax Transformers model on token classification tasks (NER, POS, CHUNKS)""" +"""Fine-tuning a 🤗 Flax Transformers model on token classification tasks (NER, POS, CHUNKS)""" + import json import logging import math diff --git a/examples/legacy/benchmarking/plot_csv_file.py b/examples/legacy/benchmarking/plot_csv_file.py index 9a9ad9c670..aa092f5c04 100644 --- a/examples/legacy/benchmarking/plot_csv_file.py +++ b/examples/legacy/benchmarking/plot_csv_file.py @@ -93,14 +93,14 @@ class Plot: self.result_dict[model_name]["seq_len"].append(int(row["sequence_length"])) if can_convert_to_int(row["result"]): # value is not None - self.result_dict[model_name]["result"][ - (int(row["batch_size"]), int(row["sequence_length"])) - ] = int(row["result"]) + self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = ( + int(row["result"]) + ) elif can_convert_to_float(row["result"]): # value is not None - self.result_dict[model_name]["result"][ - (int(row["batch_size"]), int(row["sequence_length"])) - ] = float(row["result"]) + self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = ( + float(row["result"]) + ) def plot(self): fig, ax = plt.subplots() diff --git a/examples/legacy/benchmarking/run_benchmark.py b/examples/legacy/benchmarking/run_benchmark.py index e2e7d4c5ea..85f266566b 100644 --- a/examples/legacy/benchmarking/run_benchmark.py +++ b/examples/legacy/benchmarking/run_benchmark.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Benchmarking the library on inference and training """ +"""Benchmarking the library on inference and training""" from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments diff --git a/examples/legacy/multiple_choice/run_multiple_choice.py b/examples/legacy/multiple_choice/run_multiple_choice.py index 4513970425..fece480cad 100644 --- a/examples/legacy/multiple_choice/run_multiple_choice.py +++ b/examples/legacy/multiple_choice/run_multiple_choice.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet).""" - +"""Finetuning the library models for multiple choice (Bert, Roberta, XLNet).""" import logging import os diff --git a/examples/legacy/multiple_choice/utils_multiple_choice.py b/examples/legacy/multiple_choice/utils_multiple_choice.py index e3bbc72884..6b7559c49e 100644 --- a/examples/legacy/multiple_choice/utils_multiple_choice.py +++ b/examples/legacy/multiple_choice/utils_multiple_choice.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """ - +"""Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension""" import csv import glob diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py index 999752485b..f5a827c15a 100644 --- a/examples/legacy/question-answering/run_squad.py +++ b/examples/legacy/question-answering/run_squad.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" - +"""Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" import argparse import glob diff --git a/examples/legacy/question-answering/run_squad_trainer.py b/examples/legacy/question-answering/run_squad_trainer.py index 7e3a6f28e0..a27cb76295 100644 --- a/examples/legacy/question-answering/run_squad_trainer.py +++ b/examples/legacy/question-answering/run_squad_trainer.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-tuning the library models for question-answering.""" - +"""Fine-tuning the library models for question-answering.""" import logging import os diff --git a/examples/legacy/run_language_modeling.py b/examples/legacy/run_language_modeling.py index b157658656..317b2dfb48 100755 --- a/examples/legacy/run_language_modeling.py +++ b/examples/legacy/run_language_modeling.py @@ -20,7 +20,6 @@ GPT, GPT-2 and CTRL are fine-tuned using a causal language modeling (CLM) loss. using a masked language modeling (MLM) loss. XLNet is fine-tuned using a permutation language modeling (PLM) loss. """ - import logging import math import os diff --git a/examples/legacy/run_openai_gpt.py b/examples/legacy/run_openai_gpt.py index d0c21aba27..3831c1bd44 100755 --- a/examples/legacy/run_openai_gpt.py +++ b/examples/legacy/run_openai_gpt.py @@ -28,6 +28,7 @@ --output_dir ../log \ --train_batch_size 16 \ """ + import argparse import csv import logging diff --git a/examples/legacy/run_swag.py b/examples/legacy/run_swag.py index 66d77a1742..dbf712a71f 100755 --- a/examples/legacy/run_swag.py +++ b/examples/legacy/run_swag.py @@ -15,10 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """BERT finetuning runner. - Finetuning the library models for multiple choice on SWAG (Bert). +Finetuning the library models for multiple choice on SWAG (Bert). """ - import argparse import csv import glob diff --git a/examples/legacy/run_transfo_xl.py b/examples/legacy/run_transfo_xl.py index 1c48974f39..ce24fe13d7 100755 --- a/examples/legacy/run_transfo_xl.py +++ b/examples/legacy/run_transfo_xl.py @@ -14,14 +14,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Transformer XL model evaluation script. - Adapted from https://github.com/kimiyoung/transformer-xl. - In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py +"""PyTorch Transformer XL model evaluation script. 
+Adapted from https://github.com/kimiyoung/transformer-xl. +In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py - This script with default values evaluates a pretrained Transformer-XL on WikiText 103 +This script with default values evaluates a pretrained Transformer-XL on WikiText 103 """ - import argparse import logging import math diff --git a/examples/legacy/seq2seq/xla_spawn.py b/examples/legacy/seq2seq/xla_spawn.py index 5df6bfa2d5..f9955acfa2 100644 --- a/examples/legacy/seq2seq/xla_spawn.py +++ b/examples/legacy/seq2seq/xla_spawn.py @@ -23,7 +23,6 @@ Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/lau """ - import importlib import sys from argparse import REMAINDER, ArgumentParser diff --git a/examples/legacy/token-classification/run_ner.py b/examples/legacy/token-classification/run_ner.py index c571d44a12..4fb74e78ff 100644 --- a/examples/legacy/token-classification/run_ner.py +++ b/examples/legacy/token-classification/run_ner.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-tuning the library models for named entity recognition on CoNLL-2003. """ +"""Fine-tuning the library models for named entity recognition on CoNLL-2003.""" + import logging import os import sys diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py index e7e3a157e3..da4d8c3b60 100644 --- a/examples/legacy/token-classification/utils_ner.py +++ b/examples/legacy/token-classification/utils_ner.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """ - +"""Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task.""" import logging import os diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 03d75ee3a2..b8f69b4b6f 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning any 🤗 Transformers model for image classification leveraging 🤗 Accelerate.""" +"""Finetuning any 🤗 Transformers model for image classification leveraging 🤗 Accelerate.""" + import argparse import json import logging diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index af66ff2a84..f0889f3d28 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Finetuning 🤗 Transformers model for object detection with Accelerate.""" +"""Finetuning 🤗 Transformers model for object detection with Accelerate.""" import argparse import json diff --git a/examples/pytorch/question-answering/trainer_qa.py b/examples/pytorch/question-answering/trainer_qa.py index 0e82e6b816..10428a2b77 100644 --- a/examples/pytorch/question-answering/trainer_qa.py +++ b/examples/pytorch/question-answering/trainer_qa.py @@ -15,6 +15,7 @@ """ A subclass of `Trainer` specific to Question-Answering tasks """ + import math import time diff --git a/examples/pytorch/question-answering/trainer_seq2seq_qa.py b/examples/pytorch/question-answering/trainer_seq2seq_qa.py index dea184e908..ff9e2d7bc3 100644 --- a/examples/pytorch/question-answering/trainer_seq2seq_qa.py +++ b/examples/pytorch/question-answering/trainer_seq2seq_qa.py @@ -15,6 +15,7 @@ """ A subclass of `Trainer` specific to Question-Answering tasks """ + import math import time from typing import Dict, List, Optional diff --git a/examples/pytorch/question-answering/utils_qa.py b/examples/pytorch/question-answering/utils_qa.py index 23a46370d1..79497dbb81 100644 --- a/examples/pytorch/question-answering/utils_qa.py +++ b/examples/pytorch/question-answering/utils_qa.py @@ -15,6 +15,7 @@ """ Post-processing utilities for question answering. """ + import collections import json import logging diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index b606740535..6521657e3c 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning any 🤗 Transformers model supported by AutoModelForSemanticSegmentation for semantic segmentation.""" +"""Finetuning any 🤗 Transformers model supported by AutoModelForSemanticSegmentation for semantic segmentation.""" import argparse import json diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 9ec37fbfd1..9592a1f6e4 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -""" Pre-Training a 🤗 Wav2Vec2 model on unlabeled audio data """ +"""Pre-Training a 🤗 Wav2Vec2 model on unlabeled audio data""" import argparse import math diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 1f02ad145b..6a05f342b3 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition""" +"""Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition""" import functools import json diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 2f8f0d6e2b..bf3241c61d 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-tuning a 🤗 Transformers CTC adapter model for automatic speech recognition""" +"""Fine-tuning a 🤗 Transformers CTC adapter model for automatic speech recognition""" import functools import json diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 16a613e025..b5da2063b6 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for text classification.""" +"""Finetuning the library models for text classification.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 751b208e9a..c9d4ec8b10 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import logging diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 338c28b184..6a8123f076 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning a 🤗 Transformers model for sequence classification on GLUE.""" +"""Finetuning a 🤗 Transformers model for sequence classification on GLUE.""" + import argparse import json import logging diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index b38cd80aeb..127f06e0f6 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -14,8 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning multi-lingual models on XNLI (e.g. 
Bert, DistilBERT, XLM). - Adapted from `examples/text-classification/run_glue.py`""" +"""Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM). +Adapted from `examples/text-classification/run_glue.py`""" import logging import os diff --git a/examples/pytorch/text-generation/run_generation.py b/examples/pytorch/text-generation/run_generation.py index 557b75572c..0e21a24268 100755 --- a/examples/pytorch/text-generation/run_generation.py +++ b/examples/pytorch/text-generation/run_generation.py @@ -14,9 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet) -""" - +"""Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/CTRL/Transformer-XL/XLNet)""" import argparse import inspect diff --git a/examples/pytorch/text-generation/run_generation_contrastive_search.py b/examples/pytorch/text-generation/run_generation_contrastive_search.py index a48529fb30..ba4c9a77e9 100755 --- a/examples/pytorch/text-generation/run_generation_contrastive_search.py +++ b/examples/pytorch/text-generation/run_generation_contrastive_search.py @@ -13,13 +13,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" The examples of running contrastive search on the auto-APIs; +"""The examples of running contrastive search on the auto-APIs; Running this example: python run_generation_contrastive_search.py --model_name_or_path=openai-community/gpt2-large --penalty_alpha=0.6 --k=4 --length=256 """ - import argparse import logging diff --git a/examples/pytorch/xla_spawn.py b/examples/pytorch/xla_spawn.py index 5df6bfa2d5..f9955acfa2 100644 --- a/examples/pytorch/xla_spawn.py +++ b/examples/pytorch/xla_spawn.py @@ -23,7 +23,6 @@ Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/lau """ - import importlib import sys from argparse import REMAINDER, ArgumentParser diff --git a/examples/research_projects/adversarial/run_hans.py b/examples/research_projects/adversarial/run_hans.py index 3affbb7a69..23625dfa7e 100644 --- a/examples/research_projects/adversarial/run_hans.py +++ b/examples/research_projects/adversarial/run_hans.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on HANS.""" +"""Finetuning the library models for sequence classification on HANS.""" import logging import os diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py index 6881bf8d18..5b30155a73 100644 --- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py +++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch ALBERT model with Patience-based Early Exit. 
""" +"""PyTorch ALBERT model with Patience-based Early Exit.""" import logging diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py index dfa78585a6..c1ce924a57 100644 --- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py +++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""PyTorch BERT model with Patience-based Early Exit. """ - +"""PyTorch BERT model with Patience-based Early Exit.""" import logging diff --git a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py index 847148d557..d1ee5ddde3 100755 --- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py +++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Training and inference using the library models for sequence classification on GLUE (Bert, Albert) with PABEE.""" - +"""Training and inference using the library models for sequence classification on GLUE (Bert, Albert) with PABEE.""" import argparse import glob diff --git a/examples/research_projects/bertabs/configuration_bertabs.py b/examples/research_projects/bertabs/configuration_bertabs.py index 02b8f27cb3..4c65cd3395 100644 --- a/examples/research_projects/bertabs/configuration_bertabs.py +++ b/examples/research_projects/bertabs/configuration_bertabs.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BertAbs configuration """ +"""BertAbs configuration""" + import logging from transformers import PretrainedConfig diff --git a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py index b6f5d17751..338ffd21c9 100644 --- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py +++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Convert BertExtAbs's checkpoints. +"""Convert BertExtAbs's checkpoints. The script looks like it is doing something trivial but it is not. The "weights" proposed by the authors are actually the entire model pickled. We need to load diff --git a/examples/research_projects/bertology/run_bertology.py b/examples/research_projects/bertology/run_bertology.py index 4cb046066c..35d096f164 100644 --- a/examples/research_projects/bertology/run_bertology.py +++ b/examples/research_projects/bertology/run_bertology.py @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Bertology: this script shows how you can explore the internals of the models in the library to: - - compute the entropy of the head attentions - - compute the importance of each head - - prune (remove) the low importance head. - Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650) - which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1 +"""Bertology: this script shows how you can explore the internals of the models in the library to: +- compute the entropy of the head attentions +- compute the importance of each head +- prune (remove) the low importance head. +Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650) +which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1 """ + import argparse import logging import os diff --git a/examples/research_projects/bertology/run_prune_gpt.py b/examples/research_projects/bertology/run_prune_gpt.py index fa7484a787..d227634c2b 100644 --- a/examples/research_projects/bertology/run_prune_gpt.py +++ b/examples/research_projects/bertology/run_prune_gpt.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -""" This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py) +"""This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py) to prune GPT-like models. The author is @altsoph. """ diff --git a/examples/research_projects/distillation/distiller.py b/examples/research_projects/distillation/distiller.py index 3ef2ba87b2..963af976f5 100644 --- a/examples/research_projects/distillation/distiller.py +++ b/examples/research_projects/distillation/distiller.py @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" The distiller to distil the student. - Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) +"""The distiller to distil the student. +Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ + import math import os import time diff --git a/examples/research_projects/distillation/grouped_batch_sampler.py b/examples/research_projects/distillation/grouped_batch_sampler.py index a068f7e09e..fd126b13b5 100644 --- a/examples/research_projects/distillation/grouped_batch_sampler.py +++ b/examples/research_projects/distillation/grouped_batch_sampler.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py) -""" +"""Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)""" + import bisect import copy from collections import defaultdict diff --git a/examples/research_projects/distillation/lm_seqs_dataset.py b/examples/research_projects/distillation/lm_seqs_dataset.py index 8e0a5814ab..647c8f464f 100644 --- a/examples/research_projects/distillation/lm_seqs_dataset.py +++ b/examples/research_projects/distillation/lm_seqs_dataset.py @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Dataset to distilled models - adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) +"""Dataset to distilled models +adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ + import numpy as np import torch from torch.utils.data import Dataset diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py index 523d9bedb8..a1150f6b43 100644 --- a/examples/research_projects/distillation/run_squad_w_distillation.py +++ b/examples/research_projects/distillation/run_squad_w_distillation.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" This is the exact same script as `examples/question-answering/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation.""" +"""This is the exact same script as `examples/question-answering/run_squad.py` (as of 2020, January 8th) with an additional and optional step of distillation.""" import argparse import glob diff --git a/examples/research_projects/distillation/scripts/binarized_data.py b/examples/research_projects/distillation/scripts/binarized_data.py index 951530d5c7..3fc3214acf 100644 --- a/examples/research_projects/distillation/scripts/binarized_data.py +++ b/examples/research_projects/distillation/scripts/binarized_data.py @@ -15,6 +15,7 @@ """ Preprocessing script before distillation. """ + import argparse import logging import pickle diff --git a/examples/research_projects/distillation/scripts/extract.py b/examples/research_projects/distillation/scripts/extract.py index f60f243dec..c45821d187 100644 --- a/examples/research_projects/distillation/scripts/extract.py +++ b/examples/research_projects/distillation/scripts/extract.py @@ -16,6 +16,7 @@ Preprocessing script before training the distilled model. Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2. """ + import argparse import torch diff --git a/examples/research_projects/distillation/scripts/extract_distilbert.py b/examples/research_projects/distillation/scripts/extract_distilbert.py index a58105f999..8637970c51 100644 --- a/examples/research_projects/distillation/scripts/extract_distilbert.py +++ b/examples/research_projects/distillation/scripts/extract_distilbert.py @@ -16,6 +16,7 @@ Preprocessing script before training DistilBERT. Specific to BERT -> DistilBERT. 
""" + import argparse import torch diff --git a/examples/research_projects/distillation/scripts/token_counts.py b/examples/research_projects/distillation/scripts/token_counts.py index 736b564ee7..2f80bf31f4 100644 --- a/examples/research_projects/distillation/scripts/token_counts.py +++ b/examples/research_projects/distillation/scripts/token_counts.py @@ -15,6 +15,7 @@ """ Preprocessing script before training the distilled model. """ + import argparse import logging import pickle diff --git a/examples/research_projects/distillation/train.py b/examples/research_projects/distillation/train.py index 1acb527220..15d98ace09 100644 --- a/examples/research_projects/distillation/train.py +++ b/examples/research_projects/distillation/train.py @@ -16,6 +16,7 @@ Training the distilled model. Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2. """ + import argparse import json import os diff --git a/examples/research_projects/distillation/utils.py b/examples/research_projects/distillation/utils.py index 6d439453fe..e86d2593bb 100644 --- a/examples/research_projects/distillation/utils.py +++ b/examples/research_projects/distillation/utils.py @@ -12,9 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Utils to train DistilBERT - adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) +"""Utils to train DistilBERT +adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) """ + import json import logging import os diff --git a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py index fbb165ba42..8940fab5bd 100755 --- a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py +++ b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py @@ -20,6 +20,7 @@ text file or a dataset. Here is the full list of checkpoints on the hub that can be fine-tuned by this script: https://huggingface.co/models?filter=fill-mask """ + import logging import os import sys diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py index 499de53207..8aea9b5e1a 100644 --- a/examples/research_projects/lxmert/modeling_frcnn.py +++ b/examples/research_projects/lxmert/modeling_frcnn.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 && Huggingface Co. +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 && Huggingface Co. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import itertools import math import os diff --git a/examples/research_projects/lxmert/processing_image.py b/examples/research_projects/lxmert/processing_image.py index 4343cfdbce..65f8f6cd37 100644 --- a/examples/research_projects/lxmert/processing_image.py +++ b/examples/research_projects/lxmert/processing_image.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import sys from typing import Tuple diff --git a/examples/research_projects/lxmert/utils.py b/examples/research_projects/lxmert/utils.py index c75f523a08..995fbd2c19 100644 --- a/examples/research_projects/lxmert/utils.py +++ b/examples/research_projects/lxmert/utils.py @@ -1,20 +1,20 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal, Huggingface team :) - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal, Huggingface team :) +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License.import copy - """ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" import copy import fnmatch diff --git a/examples/research_projects/lxmert/visualizing_image.py b/examples/research_projects/lxmert/visualizing_image.py index 163d661e87..dcfd8426ff 100644 --- a/examples/research_projects/lxmert/visualizing_image.py +++ b/examples/research_projects/lxmert/visualizing_image.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import colorsys import io diff --git a/examples/research_projects/mm-imdb/run_mmimdb.py b/examples/research_projects/mm-imdb/run_mmimdb.py index c863857c41..686691e0b9 100644 --- a/examples/research_projects/mm-imdb/run_mmimdb.py +++ b/examples/research_projects/mm-imdb/run_mmimdb.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset.""" - +"""Finetuning the library models for multimodal multiclass prediction on MM-IMDB dataset.""" import argparse import glob diff --git a/examples/research_projects/movement-pruning/counts_parameters.py b/examples/research_projects/movement-pruning/counts_parameters.py index 89ce40baa7..c0ac53fb78 100644 --- a/examples/research_projects/movement-pruning/counts_parameters.py +++ b/examples/research_projects/movement-pruning/counts_parameters.py @@ -15,6 +15,7 @@ Count remaining (non-zero) weights in the encoder (i.e. the transformer layers). Sparsity and remaining weights levels are equivalent: sparsity % = 100 - remaining weights %. 
""" + import argparse import os diff --git a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py index 2a3bd763a2..9c7459f27a 100644 --- a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py @@ -13,10 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Masked BERT model configuration. It replicates the class `~transformers.BertConfig` +"""Masked BERT model configuration. It replicates the class `~transformers.BertConfig` and adapts it to the specificities of MaskedBert (`pruning_method`, `mask_init` and `mask_scale`.""" - import logging from transformers.configuration_utils import PretrainedConfig diff --git a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py index f47395bb00..8c0b091c7d 100644 --- a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py @@ -18,7 +18,6 @@ compute the adaptive mask. Built on top of `transformers.models.bert.modeling_bert`""" - import logging import math diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py index e2090c431e..f7103deca1 100644 --- a/examples/research_projects/movement-pruning/masked_run_glue.py +++ b/examples/research_projects/movement-pruning/masked_run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-pruning Masked BERT on sequence classification on GLUE.""" +"""Fine-pruning Masked BERT on sequence classification on GLUE.""" import argparse import glob diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py index 14d92dde4e..d7b4b19112 100644 --- a/examples/research_projects/movement-pruning/masked_run_squad.py +++ b/examples/research_projects/movement-pruning/masked_run_squad.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-pruning Masked BERT for question-answering on SQuAD.""" - +"""Fine-pruning Masked BERT for question-answering on SQuAD.""" import argparse import glob diff --git a/examples/research_projects/onnx/summarization/run_onnx_exporter.py b/examples/research_projects/onnx/summarization/run_onnx_exporter.py index 889eefb4e7..fa82673270 100644 --- a/examples/research_projects/onnx/summarization/run_onnx_exporter.py +++ b/examples/research_projects/onnx/summarization/run_onnx_exporter.py @@ -13,9 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" +""" """ -""" import argparse import logging import os diff --git a/examples/research_projects/performer/run_mlm_performer.py b/examples/research_projects/performer/run_mlm_performer.py index 4261d9c184..0332fe1575 100644 --- a/examples/research_projects/performer/run_mlm_performer.py +++ b/examples/research_projects/performer/run_mlm_performer.py @@ -19,6 +19,7 @@ text file or a dataset. Here is the full list of checkpoints on the hub that can be fine-tuned by this script: https://huggingface.co/models?filter=fill-mask """ + import logging import os import sys diff --git a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py index 677b9c7860..7a8ea2109b 100755 --- a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py +++ b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" +"""Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" + import argparse import logging import os diff --git a/examples/research_projects/quantization-qdqbert/quant_trainer.py b/examples/research_projects/quantization-qdqbert/quant_trainer.py index 09bac19e92..132aa28490 100755 --- a/examples/research_projects/quantization-qdqbert/quant_trainer.py +++ b/examples/research_projects/quantization-qdqbert/quant_trainer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Helper functions for training models with pytorch-quantization""" + import logging import re diff --git a/examples/research_projects/quantization-qdqbert/utils_qa.py b/examples/research_projects/quantization-qdqbert/utils_qa.py index fd0bc16f7e..e90d6c4747 100644 --- a/examples/research_projects/quantization-qdqbert/utils_qa.py +++ b/examples/research_projects/quantization-qdqbert/utils_qa.py @@ -15,6 +15,7 @@ """ Post-processing utilities for question answering. 
""" + import collections import json import logging diff --git a/examples/research_projects/rag-end2end-retriever/eval_rag.py b/examples/research_projects/rag-end2end-retriever/eval_rag.py index a8e7abbca6..55f4da5657 100644 --- a/examples/research_projects/rag-end2end-retriever/eval_rag.py +++ b/examples/research_projects/rag-end2end-retriever/eval_rag.py @@ -1,4 +1,4 @@ -""" Evaluation script for RAG models.""" +"""Evaluation script for RAG models.""" import argparse import ast diff --git a/examples/research_projects/rag/eval_rag.py b/examples/research_projects/rag/eval_rag.py index a8e7abbca6..55f4da5657 100644 --- a/examples/research_projects/rag/eval_rag.py +++ b/examples/research_projects/rag/eval_rag.py @@ -1,4 +1,4 @@ -""" Evaluation script for RAG models.""" +"""Evaluation script for RAG models.""" import argparse import ast diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py index ebf33eb01d..0f7abde37e 100755 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition""" +"""Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition""" import functools import json diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py index 8a8eda851b..0fb567aba0 100644 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -""" Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition in streaming mode""" +"""Fine-tuning a 🤗 Transformers CTC model for automatic speech recognition in streaming mode""" import logging import os diff --git a/examples/research_projects/visual_bert/modeling_frcnn.py b/examples/research_projects/visual_bert/modeling_frcnn.py index 499de53207..8aea9b5e1a 100644 --- a/examples/research_projects/visual_bert/modeling_frcnn.py +++ b/examples/research_projects/visual_bert/modeling_frcnn.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 && Huggingface Co. +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 && Huggingface Co. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import itertools import math import os diff --git a/examples/research_projects/visual_bert/processing_image.py b/examples/research_projects/visual_bert/processing_image.py index 4343cfdbce..65f8f6cd37 100644 --- a/examples/research_projects/visual_bert/processing_image.py +++ b/examples/research_projects/visual_bert/processing_image.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import sys from typing import Tuple diff --git a/examples/research_projects/visual_bert/utils.py b/examples/research_projects/visual_bert/utils.py index c75f523a08..995fbd2c19 100644 --- a/examples/research_projects/visual_bert/utils.py +++ b/examples/research_projects/visual_bert/utils.py @@ -1,20 +1,20 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal, Huggingface team :) - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal, Huggingface team :) +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" import copy import fnmatch diff --git a/examples/research_projects/visual_bert/visualizing_image.py b/examples/research_projects/visual_bert/visualizing_image.py index 163d661e87..dcfd8426ff 100644 --- a/examples/research_projects/visual_bert/visualizing_image.py +++ b/examples/research_projects/visual_bert/visualizing_image.py @@ -1,20 +1,21 @@ """ - coding=utf-8 - Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal - Adapted From Facebook Inc, Detectron2 +coding=utf-8 +Copyright 2018, Antonio Mendoza Hao Tan, Mohit Bansal +Adapted From Facebook Inc, Detectron2 - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.import copy +""" - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.import copy - """ import colorsys import io diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py index e01ccbf448..a467b3c6eb 100644 --- a/examples/research_projects/xtreme-s/run_xtreme_s.py +++ b/examples/research_projects/xtreme-s/run_xtreme_s.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and -""" Fine-tuning a 🤗 Transformers pretrained speech model on the XTREME-S benchmark tasks""" +"""Fine-tuning a 🤗 Transformers pretrained speech model on the XTREME-S benchmark tasks""" import json import logging diff --git a/examples/tensorflow/benchmarking/plot_csv_file.py b/examples/tensorflow/benchmarking/plot_csv_file.py index 9a9ad9c670..aa092f5c04 100644 --- a/examples/tensorflow/benchmarking/plot_csv_file.py +++ b/examples/tensorflow/benchmarking/plot_csv_file.py @@ -93,14 +93,14 @@ class Plot: self.result_dict[model_name]["seq_len"].append(int(row["sequence_length"])) if can_convert_to_int(row["result"]): # value is not None - self.result_dict[model_name]["result"][ - (int(row["batch_size"]), int(row["sequence_length"])) - ] = int(row["result"]) + self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = ( + int(row["result"]) + ) elif can_convert_to_float(row["result"]): # value is not None - self.result_dict[model_name]["result"][ - (int(row["batch_size"]), int(row["sequence_length"])) - ] = float(row["result"]) + self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = ( + float(row["result"]) + ) def plot(self): fig, ax = plt.subplots() diff --git a/examples/tensorflow/benchmarking/run_benchmark_tf.py b/examples/tensorflow/benchmarking/run_benchmark_tf.py index 25aabc5f51..4a43cf7d91 100755 --- a/examples/tensorflow/benchmarking/run_benchmark_tf.py +++ b/examples/tensorflow/benchmarking/run_benchmark_tf.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Benchmarking the library on inference and training in TensorFlow""" +"""Benchmarking the library on inference and training in TensorFlow""" from transformers import HfArgumentParser, TensorFlowBenchmark, TensorFlowBenchmarkArguments diff --git a/examples/tensorflow/question-answering/utils_qa.py b/examples/tensorflow/question-answering/utils_qa.py index 23a46370d1..79497dbb81 100644 --- a/examples/tensorflow/question-answering/utils_qa.py +++ b/examples/tensorflow/question-answering/utils_qa.py @@ -15,6 +15,7 @@ """ Post-processing utilities for question answering. """ + import collections import json import logging diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index d21bd1ec46..9f3893e887 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
import json diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py index e4f0644d12..379f367403 100644 --- a/examples/tensorflow/text-classification/run_text_classification.py +++ b/examples/tensorflow/text-classification/run_text_classification.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fine-tuning the library models for sequence classification.""" +"""Fine-tuning the library models for sequence classification.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. import json diff --git a/setup.py b/setup.py index efdb5342d0..9ad7c84b58 100644 --- a/setup.py +++ b/setup.py @@ -156,7 +156,7 @@ _deps = [ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.1.5", + "ruff==0.4.4", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.4.1", diff --git a/src/transformers/audio_utils.py b/src/transformers/audio_utils.py index c5c73550c1..4dc408bfa2 100644 --- a/src/transformers/audio_utils.py +++ b/src/transformers/audio_utils.py @@ -16,6 +16,7 @@ Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks and remove unnecessary dependencies. """ + import warnings from typing import Optional, Tuple, Union diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 3c5c877a45..3d4f588a5b 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Benchmarking the library on inference and training in PyTorch. +Benchmarking the library on inference and training in PyTorch. """ - import timeit from typing import Callable, Optional diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py index c813591be0..f680222955 100644 --- a/src/transformers/benchmark/benchmark_tf.py +++ b/src/transformers/benchmark/benchmark_tf.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Benchmarking the library on inference and training in PyTorch. +Benchmarking the library on inference and training in PyTorch. """ - import random import timeit from functools import wraps diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py index a71b1fb65a..a721f98cfd 100644 --- a/src/transformers/benchmark/benchmark_utils.py +++ b/src/transformers/benchmark/benchmark_utils.py @@ -249,7 +249,6 @@ def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_i else: class MemoryMeasureProcess(Process): - """ `MemoryMeasureProcess` inherits from `Process` and overwrites its `run()` method. Used to measure the memory usage of a process diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 86b946d162..b3d76aff9c 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Configuration base class and utilities.""" - +"""Configuration base class and utilities.""" import copy import json diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index e3270bb9de..051f1d148a 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -400,7 +400,7 @@ def optimize(onnx_model_path: Path) -> Path: sess_option.optimized_model_filepath = opt_model_path.as_posix() _ = InferenceSession(onnx_model_path.as_posix(), sess_option) - print(f"Optimized model has been written at {opt_model_path}: \N{heavy check mark}") + print(f"Optimized model has been written at {opt_model_path}: \N{HEAVY CHECK MARK}") print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\") return opt_model_path @@ -475,7 +475,7 @@ def quantize(onnx_model_path: Path) -> Path: quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model - print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") + print(f"Quantized model has been written at {quantized_model_path}: \N{HEAVY CHECK MARK}") onnx.save_model(quantizer.model.model, quantized_model_path.as_posix()) return quantized_model_path @@ -489,9 +489,9 @@ def verify(path: Path): try: onnx_options = SessionOptions() _ = InferenceSession(path.as_posix(), onnx_options, providers=["CPUExecutionProvider"]) - print(f"Model {path} correctly loaded: \N{heavy check mark}") + print(f"Model {path} correctly loaded: \N{HEAVY CHECK MARK}") except RuntimeException as re: - print(f"Error while loading the model {re}: \N{heavy ballot x}") + print(f"Error while loading the model {re}: \N{HEAVY BALLOT X}") if __name__ == "__main__": diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index c544c8c9e1..3875879f0e 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Convert pytorch checkpoints to TensorFlow""" - +"""Convert pytorch checkpoints to TensorFlow""" import argparse import os diff --git a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py index a032ee93b0..cddf18951d 100755 --- a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py +++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Convert slow tokenizers checkpoints in fast (serialization format of the `tokenizers` library)""" +"""Convert slow tokenizers checkpoints in fast (serialization format of the `tokenizers` library)""" import argparse import os diff --git a/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py b/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py index 2b003d4bc4..8ccb033b3d 100755 --- a/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py +++ b/src/transformers/convert_tf_hub_seq_to_seq_bert_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Seq2Seq TF Hub checkpoint.""" - import argparse from . import ( diff --git a/src/transformers/data/metrics/squad_metrics.py b/src/transformers/data/metrics/squad_metrics.py index 6eea34ad9e..5d98a0bfcf 100644 --- a/src/transformers/data/metrics/squad_metrics.py +++ b/src/transformers/data/metrics/squad_metrics.py @@ -20,7 +20,6 @@ additional na_prob.json file is provided. This file is expected to map question probability that a question is unanswerable. """ - import collections import json import math diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 3d22968c9d..6e95a66684 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GLUE processors and helpers""" +"""GLUE processors and helpers""" import os import warnings diff --git a/src/transformers/data/processors/xnli.py b/src/transformers/data/processors/xnli.py index 3f1a11fcd6..459c5bc3a6 100644 --- a/src/transformers/data/processors/xnli.py +++ b/src/transformers/data/processors/xnli.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XNLI utils (dataset loading and evaluation)""" - +"""XNLI utils (dataset loading and evaluation)""" import os diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 840d9cc2f5..6fd22d8c5c 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -17,6 +17,7 @@ in `integrations/deepspeed` instead. Check: https://github.com/huggingface/transformers/pull/25599 """ + import warnings diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index ceca99d450..2e48650b9c 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -62,7 +62,7 @@ deps = { "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.1.5", + "ruff": "ruff==0.4.4", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.1", diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py index d80ffdc58c..9de22a3592 100644 --- a/src/transformers/dynamic_module_utils.py +++ b/src/transformers/dynamic_module_utils.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Utilities to dynamically load objects from the Hub.""" + import filecmp import importlib import importlib.util diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py index 40717d9931..f74a3f0c40 100644 --- a/src/transformers/feature_extraction_sequence_utils.py +++ b/src/transformers/feature_extraction_sequence_utils.py @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Sequence feature extraction class for common feature extractors to preprocess sequences. +Sequence feature extraction class for common feature extractors to preprocess sequences. """ + from typing import Dict, List, Optional, Union import numpy as np diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index 44e01f8d85..46125b8fa7 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Feature extraction saving/loading class for common feature extractors. +Feature extraction saving/loading class for common feature extractors. """ import copy diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 9a5fee5207..eb14c60d9a 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Generation configuration class and utilities.""" +"""Generation configuration class and utilities.""" import copy import json diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py index 5fd1c2f773..14da9e697a 100644 --- a/src/transformers/generation/stopping_criteria.py +++ b/src/transformers/generation/stopping_criteria.py @@ -387,9 +387,9 @@ class StopStringCriteria(StoppingCriteria): # Since this is lots of very small assignments of lists, we build it with numpy rather # than torch for speed + simplicity, then convert to torch at the end for token_idx, valid_positions in positions.items(): - gather_vec[ - token_idx, max_valid_positions * i : max_valid_positions * i + len(valid_positions) - ] = valid_positions + gather_vec[token_idx, max_valid_positions * i : max_valid_positions * i + len(valid_positions)] = ( + valid_positions + ) for token_idx, possible_end_lens in end_lens.items(): gather_vec[ token_idx, diff --git a/src/transformers/generation/watermarking.py b/src/transformers/generation/watermarking.py index 297d388d54..e998d996ec 100644 --- a/src/transformers/generation/watermarking.py +++ b/src/transformers/generation/watermarking.py @@ -68,7 +68,6 @@ class WatermarkDetectorOutput: class WatermarkDetector: - r""" Detector for detection of watermark generated text. The detector needs to be given the exact same settings that were given during text generation to replicate the watermark greenlist generation and so detect the watermark. 
This includes diff --git a/src/transformers/integrations/aqlm.py b/src/transformers/integrations/aqlm.py index 903d0ecdae..ee4bbcb896 100644 --- a/src/transformers/integrations/aqlm.py +++ b/src/transformers/integrations/aqlm.py @@ -13,7 +13,6 @@ # limitations under the License. "AQLM (Additive Quantization of Language Model) integration file" - from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index a83b27e95a..30427aa405 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. "AWQ (Activation aware Weight Quantization) integration file" + from ..activations import ACT2FN from ..modeling_utils import PreTrainedModel from ..utils import is_auto_awq_available, is_torch_available, logging diff --git a/src/transformers/integrations/deepspeed.py b/src/transformers/integrations/deepspeed.py index 4754c37a1e..aae1204acf 100644 --- a/src/transformers/integrations/deepspeed.py +++ b/src/transformers/integrations/deepspeed.py @@ -14,6 +14,7 @@ """ Integration with Deepspeed """ + import copy import importlib.metadata as importlib_metadata import importlib.util diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 13660a0a81..3907d80a2a 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -17,6 +17,7 @@ Integration with GGML / The file is copied and adapted from https://github.com/99991/pygguf with extra methods beings exposed """ + from array import array import numpy as np diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 39de7d6d32..66f8c1da8f 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -14,6 +14,7 @@ """ Integrations with other Python libraries. """ + import functools import importlib.metadata import importlib.util diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index 4776737a37..60394f569c 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Configuration base class and utilities.""" - +"""Configuration base class and utilities.""" import copy import json diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index aceb462d12..4b61f7140c 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch - Flax general utilities.""" - +"""PyTorch - Flax general utilities.""" import os from pickle import UnpicklingError diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 163178929f..7f1367481a 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch - TF 2.0 general utilities.""" - +"""PyTorch - TF 2.0 general utilities.""" import os import re diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index 492ca2f65b..bae88486e1 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ALBERT model configuration""" +"""ALBERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py index eecada8b43..df2a226101 100644 --- a/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ALBERT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index ad2c2d637a..3a50eeb20e 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 ALBERT model.""" - +"""TF 2.0 ALBERT model.""" from __future__ import annotations diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 786f9eeafc..4068c7aad8 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for ALBERT model.""" - +"""Tokenization classes for ALBERT model.""" import os import unicodedata diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index e0b09a7356..eadfdcecfc 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Tokenization classes for ALBERT model.""" - +"""Tokenization classes for ALBERT model.""" import os from shutil import copyfile diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 9e96f5d15a..199b51153a 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ALIGN model configuration""" +"""ALIGN model configuration""" import os from typing import TYPE_CHECKING, List, Union diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 08de9aa14f..d6e6023a26 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ALIGN model.""" +"""PyTorch ALIGN model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 8bcea7eb5d..3bc97afd1c 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -16,7 +16,6 @@ Image/Text processor class for ALIGN """ - from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index d6e533e1f8..3195d43e0b 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AltCLIP model configuration""" +"""AltCLIP model configuration""" + import os from typing import Union diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index dfccab53ea..6bffdc70a5 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch AltCLIP model.""" +"""PyTorch AltCLIP model.""" + import math from dataclasses import dataclass from typing import Any, List, Optional, Tuple, Union diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 9518c55d40..2814b2d7f2 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -15,6 +15,7 @@ """ Image/Text processor class for AltCLIP """ + import warnings from ...processing_utils import ProcessorMixin diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 158f1ee5d9..9e1d995dc2 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Audio Spectogram Transformer (AST) model configuration""" - +"""Audio Spectogram Transformer (AST) model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index 32e0f33d04..2f75d07592 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 523ab85f14..beb249202b 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Audio Spectrogram Transformer (AST) model.""" +"""PyTorch Audio Spectrogram Transformer (AST) model.""" import math from typing import Dict, List, Optional, Set, Tuple, Union diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py index de813c43cf..6b572b2527 100644 --- a/src/transformers/models/auto/auto_factory.py +++ b/src/transformers/models/auto/auto_factory.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Factory function to build auto-model classes.""" + import copy import importlib import json diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d48b4a2005..464f80e2d9 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Auto Config class.""" +"""Auto Config class.""" + import importlib import os import re diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 1dbb4eb7dc..34cb1824c1 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AutoFeatureExtractor class.""" +"""AutoFeatureExtractor class.""" + import importlib import json import os diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 41a1fcf960..eb21b58e20 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AutoImageProcessor class.""" +"""AutoImageProcessor class.""" + import importlib import json import os diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c16ab83bdc..adfcc7af9f 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Auto Model class.""" +"""Auto Model class.""" import warnings from collections import OrderedDict diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py index f8e62bf0f2..310cf5b287 100644 --- a/src/transformers/models/auto/modeling_flax_auto.py +++ b/src/transformers/models/auto/modeling_flax_auto.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Auto Model class.""" - +"""Auto Model class.""" from collections import OrderedDict diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 756da20dbc..2004495756 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Auto Model class.""" - +"""Auto Model class.""" import warnings from collections import OrderedDict diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 5b5ef98bdc..4a8295cc83 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" AutoProcessor class.""" +"""AutoProcessor class.""" + import importlib import inspect import json diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index ff323ff388..e99bc89205 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Auto Tokenizer class.""" +"""Auto Tokenizer class.""" import importlib import json diff --git a/src/transformers/models/autoformer/configuration_autoformer.py b/src/transformers/models/autoformer/configuration_autoformer.py index 4f499fd1cb..09b06f95c3 100644 --- a/src/transformers/models/autoformer/configuration_autoformer.py +++ b/src/transformers/models/autoformer/configuration_autoformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Autoformer model configuration""" +"""Autoformer model configuration""" from typing import List, Optional diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index d7e1d7a4a3..5a5b5f2439 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Autoformer model.""" +"""PyTorch Autoformer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index a6bf2b546a..6dd08b65e8 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" BARK model configuration""" +"""BARK model configuration""" import os from typing import Dict, Optional, Union diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py index 4720a70d5c..880debe60a 100644 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ b/src/transformers/models/bark/convert_suno_to_hf.py @@ -1,4 +1,5 @@ """Convert Bark checkpoint.""" + import argparse import os from pathlib import Path diff --git a/src/transformers/models/bark/generation_configuration_bark.py b/src/transformers/models/bark/generation_configuration_bark.py index 7d7d98449d..b03fd6796a 100644 --- a/src/transformers/models/bark/generation_configuration_bark.py +++ b/src/transformers/models/bark/generation_configuration_bark.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BARK model generation configuration""" +"""BARK model generation configuration""" import copy from typing import Dict diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index 0690c96e6a..9a9fa33d97 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BARK model.""" +"""PyTorch BARK model.""" + import math from typing import Dict, Optional, Tuple, Union diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index bb7690f9a1..a9bf55b51f 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -15,6 +15,7 @@ """ Processor class for Bark """ + import json import os from typing import Optional diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index 1a6214c2ee..a3bc7f3865 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BART model configuration""" +"""BART model configuration""" + import warnings from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py index d09b39d51e..e694d96ca0 100644 --- a/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BART checkpoint.""" - import argparse import os from pathlib import Path diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index bcc9ac3e07..e3b2f8a61b 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BART model.""" +"""PyTorch BART model.""" + import copy import math import warnings diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 6abfcdc398..507a93a8e7 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax Bart model.""" +"""Flax Bart model.""" import math import random diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 1e38908b4a..5ebde8cba6 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Bart model.""" - +"""TF 2.0 Bart model.""" from __future__ import annotations diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index d9bd67cf51..46decddb3e 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for the BARThez model.""" - +"""Tokenization classes for the BARThez model.""" import os from shutil import copyfile diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index e988b0d518..df8cc7757e 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for the BARThez model.""" - +"""Tokenization classes for the BARThez model.""" import os from shutil import copyfile diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index d936be41c2..df121f26e2 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for BARTpho-syllable model.""" - +"""Tokenization classes for BARTpho-syllable model.""" import os from shutil import copyfile diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index 4664bdc898..6ff00b2b87 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" BEiT model configuration""" +"""BEiT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py index 757113c8a6..c2e366d7dd 100644 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BEiT checkpoints from the unilm repository.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index d6c5a164d7..a9b38d4ee3 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BEiT model.""" - +"""PyTorch BEiT model.""" import collections.abc import math diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index 7ad3536c96..613cf6a114 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BERT model configuration""" +"""BERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py index 40533ede43..9dfd8da474 100644 --- a/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py @@ -24,6 +24,7 @@ You may adapt this script to include classification/MLM/NSP/etc. heads. Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). Models trained with never versions are not compatible with this script. """ + import argparse import os import re diff --git a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py index 09c4e3ee6c..be904ddd7e 100755 --- a/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BERT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py index 651847aee7..cba1e1a2c3 100644 --- a/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py +++ b/src/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py @@ -18,6 +18,7 @@ model. 
The official implementation of "Token Dropping" can be found in the Tenso https://github.com/tensorflow/models/tree/master/official/projects/token_dropping """ + import argparse import tensorflow as tf diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 89885887c9..16dc2fc205 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 BERT model.""" - +"""TF 2.0 BERT model.""" from __future__ import annotations diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index f645d7c08a..a8f1274663 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for Bert.""" - import collections import os import unicodedata diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index 841aec5c0f..d1d1b51b65 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BertGeneration model configuration""" +"""BertGeneration model configuration""" from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 772eb123c3..b1adb9b62b 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model BertGeneration.""" - +"""Tokenization class for model BertGeneration.""" import os from shutil import copyfile diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index fe5cd06f7f..58ff3d2b83 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes.""" - import collections import copy import os diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 7f14ed61da..f478dd0832 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Tokenization classes for BERTweet""" - +"""Tokenization classes for BERTweet""" import html import os diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py index dfd36d82c3..cbcf2e6bf5 100644 --- a/src/transformers/models/big_bird/configuration_big_bird.py +++ b/src/transformers/models/big_bird/configuration_big_bird.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BigBird model configuration""" +"""BigBird model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py index 34db9771b1..0b8e6590f9 100644 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BigBird checkpoint.""" - import argparse from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 39144c77e2..f73ab9e51f 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BigBird model.""" - +"""PyTorch BigBird model.""" import math import os @@ -919,11 +918,9 @@ class BigBirdBlockSparseAttention(nn.Module): attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[ - :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : - ] = second_last_attn_weights[ - :, :, :, to_block_size : 4 * to_block_size - ] # last three blocks (global + sliding) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 58dc57ef6d..e435477ef3 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Tokenization classes for BigBird.""" - import os import re from shutil import copyfile diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py index fa37cd4ac7..f4ccbb8b17 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py +++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for Big Bird model.""" - +"""Tokenization classes for Big Bird model.""" import os from shutil import copyfile diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py index c548573f32..9de2a7267a 100644 --- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BigBirdPegasus model configuration""" +"""BigBirdPegasus model configuration""" from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index b4e6419f99..d1ba54213a 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BigBirdPegasus model.""" +"""PyTorch BigBirdPegasus model.""" import copy import math @@ -717,11 +717,9 @@ class BigBirdPegasusBlockSparseAttention(nn.Module): attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ :, :, :, :to_block_size ] # 1st key block (global) - attention_probs[ - :, :, -2 * from_block_size : -from_block_size, -3 * to_block_size : - ] = second_last_attn_weights[ - :, :, :, to_block_size : 4 * to_block_size - ] # last three blocks (global + sliding) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) # random keys for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch diff --git a/src/transformers/models/biogpt/configuration_biogpt.py b/src/transformers/models/biogpt/configuration_biogpt.py index 936fee7632..18f7b6d6bf 100644 --- a/src/transformers/models/biogpt/configuration_biogpt.py +++ b/src/transformers/models/biogpt/configuration_biogpt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" BioGPT model configuration""" +"""BioGPT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index ad1a5df715..ae46a6ff07 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BioGPT model.""" - +"""PyTorch BioGPT model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py index e16742ec5a..f9760eb604 100644 --- a/src/transformers/models/biogpt/tokenization_biogpt.py +++ b/src/transformers/models/biogpt/tokenization_biogpt.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for BioGPT.""" + import json import os from typing import List, Optional, Tuple diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index f1532a74b9..8f4326a2d5 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BiT model configuration""" +"""BiT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 7cc7f64107..abc24290ab 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BiT checkpoints from the timm library.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/bit/modeling_bit.py b/src/transformers/models/bit/modeling_bit.py index af96150dc9..d015db4956 100644 --- a/src/transformers/models/bit/modeling_bit.py +++ b/src/transformers/models/bit/modeling_bit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BiT model. Also supports backbone for ViT hybrid.""" +"""PyTorch BiT model. Also supports backbone for ViT hybrid.""" import collections import math diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index 6b9a12e02e..105d38c255 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Blenderbot model configuration""" +"""Blenderbot model configuration""" from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 6fc86bcec4..12d259fde7 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Blenderbot model.""" - +"""PyTorch Blenderbot model.""" import copy import math diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py index 61239335be..97c9653da3 100644 --- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax Blenderbot model.""" +"""Flax Blenderbot model.""" import math import random diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index ccb07d20ec..bbfe4726de 100644 --- a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Blenderbot model.""" - +"""TF 2.0 Blenderbot model.""" from __future__ import annotations diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 3a5206cdf4..01cbf13809 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Tokenization class for Blenderbot.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index 667db5bd55..6ee26365de 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" BlenderbotSmall model configuration""" +"""BlenderbotSmall model configuration""" from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 504f073ed0..aa0e38bd8e 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BlenderbotSmall model.""" - +"""PyTorch BlenderbotSmall model.""" import copy import math diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py index b5272fb3bc..325ff0a20b 100644 --- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax BlenderbotSmall model.""" - +"""Flax BlenderbotSmall model.""" import math import random diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 01206831ac..1576462979 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 BlenderbotSmall model.""" - +"""TF 2.0 BlenderbotSmall model.""" from __future__ import annotations diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 1f647d2430..a80acdb650 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast tokenization class for BlenderbotSmall.""" + from typing import List, Optional from tokenizers import ByteLevelBPETokenizer diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 1a6fe37aa4..1131d598e0 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Blip model configuration""" +"""Blip model configuration""" import os from typing import Union diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 576d4dd5c0..371affa5ac 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BLIP model.""" +"""PyTorch BLIP model.""" import math import warnings diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index 5312cf2323..1557677eb3 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow BLIP model.""" +"""TensorFlow BLIP model.""" from __future__ import annotations diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 70dea87d35..14346d5299 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BLIP-2 model configuration""" +"""BLIP-2 model configuration""" import os from typing import Union diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 8986eabc7d..8fa55d01ee 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch BLIP-2 model.""" +"""PyTorch BLIP-2 model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py index ddea3f720a..dc9f6d3082 100644 --- a/src/transformers/models/bloom/configuration_bloom.py +++ b/src/transformers/models/bloom/configuration_bloom.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Bloom configuration""" +"""Bloom configuration""" + from collections import OrderedDict from typing import TYPE_CHECKING, Any, List, Mapping, Optional diff --git a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py index eda9a2d815..40ba6240d3 100644 --- a/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py +++ b/src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert BigScience BLOOM checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index afd3bfaaee..d0da1621d4 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for Bloom.""" - import pickle from typing import Optional, Tuple diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 8513ce21f7..4985b6ef89 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" BridgeTower model configuration""" +"""BridgeTower model configuration""" import os from typing import Union diff --git a/src/transformers/models/bros/configuration_bros.py b/src/transformers/models/bros/configuration_bros.py index 6a1ef6d948..8c2a3cc73a 100644 --- a/src/transformers/models/bros/configuration_bros.py +++ b/src/transformers/models/bros/configuration_bros.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Bros model configuration""" +"""Bros model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index 0f80cec404..c062278309 100755 --- a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Bros model.""" - +"""PyTorch Bros model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py index 7d9a20f3b0..9b1b15857c 100755 --- a/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert T5 checkpoint.""" - import argparse from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 68c70db0d1..21513ab4cd 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Tokenization class for model ByT5.""" - +"""Tokenization class for model ByT5.""" import warnings from typing import List, Optional, Tuple diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py index 124d14abec..b573801200 100644 --- a/src/transformers/models/camembert/configuration_camembert.py +++ b/src/transformers/models/camembert/configuration_camembert.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CamemBERT configuration""" +"""CamemBERT configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py index 9e66f12468..f5ddc2242b 100644 --- a/src/transformers/models/camembert/modeling_tf_camembert.py +++ b/src/transformers/models/camembert/modeling_tf_camembert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 CamemBERT model.""" - +"""TF 2.0 CamemBERT model.""" from __future__ import annotations diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index 51d70b198b..113fe1b121 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for Camembert model.""" - +"""Tokenization classes for Camembert model.""" import os from shutil import copyfile diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index d1f0db688a..ffec8d98e1 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Fast tokenization classes for Camembert model.""" - +"""Fast tokenization classes for Camembert model.""" import os from shutil import copyfile diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py index e3d2d1373b..9add399112 100644 --- a/src/transformers/models/canine/configuration_canine.py +++ b/src/transformers/models/canine/configuration_canine.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CANINE model configuration""" +"""CANINE model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py index 5d50050d03..45dcdb2903 100644 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert CANINE checkpoint.""" - import argparse from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index fda0ae72e6..c48559497a 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CANINE model.""" - +"""PyTorch CANINE model.""" import copy import math diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 0cd73f67f2..bedda2b71a 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Chinese-CLIP model configuration""" +"""Chinese-CLIP model configuration""" import os from collections import OrderedDict diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 573a39fb1c..801969c465 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Chinese-CLIP model.""" - +"""PyTorch Chinese-CLIP model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 0a36402249..8a3826779f 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CLAP model configuration""" +"""CLAP model configuration""" import os from typing import Union diff --git a/src/transformers/models/clap/feature_extraction_clap.py b/src/transformers/models/clap/feature_extraction_clap.py index ce18fedd19..2d1f16e194 100644 --- a/src/transformers/models/clap/feature_extraction_clap.py +++ b/src/transformers/models/clap/feature_extraction_clap.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Feature extractor class for CLAP.""" - import copy from typing import Any, Dict, List, Optional, Union diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 5b86ca657b..d97d36c154 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLAP model.""" +"""PyTorch CLAP model.""" + import collections import math from dataclasses import dataclass diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 827fe31d5b..34fcef0673 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CLIP model configuration""" +"""CLIP model configuration""" import os from collections import OrderedDict diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index c2fc3424a9..ca2de67176 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLIP model.""" - +"""PyTorch CLIP model.""" from dataclasses import dataclass from typing import Any, Optional, Tuple, Union diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 142141fdc4..b728da52c2 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 CLIP model.""" - +"""TF 2.0 CLIP model.""" from __future__ import annotations diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py index 6198958a03..fe5badbc54 100644 --- a/src/transformers/models/clip/tokenization_clip_fast.py +++ b/src/transformers/models/clip/tokenization_clip_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI GPT.""" - from typing import List, Optional, Tuple from tokenizers import pre_tokenizers diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 7df10bfe8c..df15c72076 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CLIPSeg model configuration""" +"""CLIPSeg model configuration""" import os from typing import Union diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 00dcecff2d..24d4b2322e 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLIPSeg model.""" +"""PyTorch CLIPSeg model.""" import copy import math diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index 505238b2a8..d40ef585aa 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CLVP model configuration""" - +"""CLVP model configuration""" import os from typing import TYPE_CHECKING, Union diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index a36e982242..3a70d68057 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CLVP model.""" - +"""PyTorch CLVP model.""" import copy import math diff --git a/src/transformers/models/clvp/number_normalizer.py b/src/transformers/models/clvp/number_normalizer.py index 86aa087e81..7824009727 100644 --- a/src/transformers/models/clvp/number_normalizer.py +++ b/src/transformers/models/clvp/number_normalizer.py @@ -15,7 +15,6 @@ """English Normalizer class for CLVP.""" - import re diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 0723986db9..4e015cea1f 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -17,7 +17,6 @@ Processor class for CLVP """ - from ...processing_utils import ProcessorMixin diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index c3cbc44cea..5bbf2d0452 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -15,6 +15,7 @@ # limitations under the License. """Tokenization classes for Code LLaMA.""" + import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/codegen/configuration_codegen.py b/src/transformers/models/codegen/configuration_codegen.py index db0008a033..cf69001480 100644 --- a/src/transformers/models/codegen/configuration_codegen.py +++ b/src/transformers/models/codegen/configuration_codegen.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CodeGen model configuration""" +"""CodeGen model configuration""" + from collections import OrderedDict from typing import Any, List, Mapping, Optional diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 0987bbc305..a8df9ed7f3 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CodeGen model.""" +"""PyTorch CodeGen model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index 1b03af7008..f3f765d273 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for CodeGen""" - import json import os from functools import lru_cache diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index b086fb84a6..9fdf2ec38e 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI GPT.""" - import json import re from typing import TYPE_CHECKING, List, Optional, Tuple, Union diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index f0b0e50b91..4060811260 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Cohere model configuration""" +"""Cohere model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index e5dc3930d3..cf7e5834b0 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Conditional DETR model configuration""" +"""Conditional DETR model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index b1a1b1c817..91f00668be 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert Conditional DETR checkpoints.""" - import argparse import json from collections import OrderedDict diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 7fd04b8b43..aa905d9e96 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Conditional DETR model.""" - +"""PyTorch Conditional DETR model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py index 82d555cd3a..2c6b544568 100644 --- a/src/transformers/models/convbert/configuration_convbert.py +++ b/src/transformers/models/convbert/configuration_convbert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ConvBERT model configuration""" +"""ConvBERT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index dd5d06ef92..b92ff686ed 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch ConvBERT model.""" - +"""PyTorch ConvBERT model.""" import math import os @@ -81,72 +80,72 @@ def load_tf_weights_in_convbert(model, config, tf_checkpoint_path): group_dense_name = "dense" for j in range(config.num_hidden_layers): - param_mapping[ - f"encoder.layer.{j}.attention.self.query.weight" - ] = f"electra/encoder/layer_{j}/attention/self/query/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.query.bias" - ] = f"electra/encoder/layer_{j}/attention/self/query/bias" - param_mapping[ - f"encoder.layer.{j}.attention.self.key.weight" - ] = f"electra/encoder/layer_{j}/attention/self/key/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.key.bias" - ] = f"electra/encoder/layer_{j}/attention/self/key/bias" - param_mapping[ - f"encoder.layer.{j}.attention.self.value.weight" - ] = f"electra/encoder/layer_{j}/attention/self/value/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.value.bias" - ] = f"electra/encoder/layer_{j}/attention/self/value/bias" - param_mapping[ - f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" - param_mapping[ - f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" - param_mapping[ - f"encoder.layer.{j}.attention.self.conv_out_layer.weight" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.self.conv_out_layer.bias" - ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" - param_mapping[ - f"encoder.layer.{j}.attention.output.dense.weight" - ] = f"electra/encoder/layer_{j}/attention/output/dense/kernel" - param_mapping[ - f"encoder.layer.{j}.attention.output.LayerNorm.weight" - ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" - param_mapping[ - f"encoder.layer.{j}.attention.output.dense.bias" - ] = f"electra/encoder/layer_{j}/attention/output/dense/bias" - param_mapping[ - f"encoder.layer.{j}.attention.output.LayerNorm.bias" - ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" - param_mapping[ - f"encoder.layer.{j}.intermediate.dense.weight" - ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" - param_mapping[ - f"encoder.layer.{j}.intermediate.dense.bias" - ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" - param_mapping[ - f"encoder.layer.{j}.output.dense.weight" - ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" - param_mapping[ - f"encoder.layer.{j}.output.dense.bias" - ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" - param_mapping[ - f"encoder.layer.{j}.output.LayerNorm.weight" - ] = f"electra/encoder/layer_{j}/output/LayerNorm/gamma" + param_mapping[f"encoder.layer.{j}.attention.self.query.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/query/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.query.bias"] = ( + 
f"electra/encoder/layer_{j}/attention/self/query/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/key/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/key/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.value.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/value/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.value.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/value/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.weight"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.bias"] = ( + f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.output.dense.weight"] = ( + f"electra/encoder/layer_{j}/attention/output/dense/kernel" + ) + param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.weight"] = ( + f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma" + ) + param_mapping[f"encoder.layer.{j}.attention.output.dense.bias"] = ( + f"electra/encoder/layer_{j}/attention/output/dense/bias" + ) + param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.bias"] = ( + f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta" + ) + param_mapping[f"encoder.layer.{j}.intermediate.dense.weight"] = ( + f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel" + ) + param_mapping[f"encoder.layer.{j}.intermediate.dense.bias"] = ( + f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias" + ) + param_mapping[f"encoder.layer.{j}.output.dense.weight"] = ( + f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel" + ) + param_mapping[f"encoder.layer.{j}.output.dense.bias"] = ( + f"electra/encoder/layer_{j}/output/{group_dense_name}/bias" + ) + param_mapping[f"encoder.layer.{j}.output.LayerNorm.weight"] = ( + f"electra/encoder/layer_{j}/output/LayerNorm/gamma" + ) param_mapping[f"encoder.layer.{j}.output.LayerNorm.bias"] = f"electra/encoder/layer_{j}/output/LayerNorm/beta" for param in model.named_parameters(): diff --git a/src/transformers/models/convbert/modeling_tf_convbert.py b/src/transformers/models/convbert/modeling_tf_convbert.py index a8ac11a8cd..95be5a56e1 100644 --- a/src/transformers/models/convbert/modeling_tf_convbert.py +++ b/src/transformers/models/convbert/modeling_tf_convbert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 ConvBERT model.""" - +"""TF 2.0 ConvBERT model.""" from __future__ import annotations diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index c0fe2c0183..f1bc98bf41 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for ConvBERT.""" + import collections import os import unicodedata diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py index 65bedb73fe..e9c47c2b04 100644 --- a/src/transformers/models/convbert/tokenization_convbert_fast.py +++ b/src/transformers/models/convbert/tokenization_convbert_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for ConvBERT.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index 2549f06b99..291faa4e1a 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ConvNeXT model configuration""" +"""ConvNeXT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py index cdcbf24d55..27315ed73f 100644 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ b/src/transformers/models/convnext/convert_convnext_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/facebookresearch/ConvNeXt""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py index 19eb1b599c..a0deaf96d5 100755 --- a/src/transformers/models/convnext/modeling_convnext.py +++ b/src/transformers/models/convnext/modeling_convnext.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ConvNext model.""" - +"""PyTorch ConvNext model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/convnext/modeling_tf_convnext.py b/src/transformers/models/convnext/modeling_tf_convnext.py index b92ac446d9..0e348a838a 100644 --- a/src/transformers/models/convnext/modeling_tf_convnext.py +++ b/src/transformers/models/convnext/modeling_tf_convnext.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 ConvNext model.""" - +"""TF 2.0 ConvNext model.""" from __future__ import annotations diff --git a/src/transformers/models/convnextv2/configuration_convnextv2.py b/src/transformers/models/convnextv2/configuration_convnextv2.py index e7692250b2..6d5b82b531 100644 --- a/src/transformers/models/convnextv2/configuration_convnextv2.py +++ b/src/transformers/models/convnextv2/configuration_convnextv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ConvNeXTV2 model configuration""" - +"""ConvNeXTV2 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py index 3f965f9c2c..df13a5ea6b 100644 --- a/src/transformers/models/convnextv2/modeling_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_convnextv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ConvNextV2 model.""" - +"""PyTorch ConvNextV2 model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py index 0debe6fd0c..e39aee5159 100644 --- a/src/transformers/models/convnextv2/modeling_tf_convnextv2.py +++ b/src/transformers/models/convnextv2/modeling_tf_convnextv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 ConvNextV2 model.""" - +"""TF 2.0 ConvNextV2 model.""" from __future__ import annotations diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py index ac454898b5..c92afb7eb6 100644 --- a/src/transformers/models/cpm/tokenization_cpm.py +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" + import os import unicodedata from shutil import copyfile diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py index 9b7b6da118..3dcf624843 100644 --- a/src/transformers/models/cpm/tokenization_cpm_fast.py +++ b/src/transformers/models/cpm/tokenization_cpm_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes.""" + import os from shutil import copyfile from typing import List, Optional, Tuple diff --git a/src/transformers/models/cpmant/configuration_cpmant.py b/src/transformers/models/cpmant/configuration_cpmant.py index 4c2a880866..155811913a 100644 --- a/src/transformers/models/cpmant/configuration_cpmant.py +++ b/src/transformers/models/cpmant/configuration_cpmant.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" CPMAnt model configuration""" +"""CPMAnt model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 9882d4ccc6..c8a3135052 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CPMAnt""" - +"""PyTorch CPMAnt""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py index a5e66c7679..2ccb296c70 100644 --- a/src/transformers/models/cpmant/tokenization_cpmant.py +++ b/src/transformers/models/cpmant/tokenization_cpmant.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for CPMAnt.""" + import collections import os from typing import List, Optional, Tuple diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index 8fd01c10b5..adea61cd67 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Salesforce CTRL configuration""" +"""Salesforce CTRL configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index c0376f673a..d84c8bb37c 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CTRL model.""" +"""PyTorch CTRL model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 86808190c8..1621cc17ca 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 CTRL model.""" +"""TF 2.0 CTRL model.""" from __future__ import annotations diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py index fdae22d2c3..5305f2b231 100644 --- a/src/transformers/models/ctrl/tokenization_ctrl.py +++ b/src/transformers/models/ctrl/tokenization_ctrl.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Tokenization classes for Salesforce CTRL.""" - import json import os from typing import Optional, Tuple diff --git a/src/transformers/models/cvt/configuration_cvt.py b/src/transformers/models/cvt/configuration_cvt.py index e8c50fbf77..a966701cee 100644 --- a/src/transformers/models/cvt/configuration_cvt.py +++ b/src/transformers/models/cvt/configuration_cvt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" CvT model configuration""" +"""CvT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py index ea4edac16c..f014365140 100644 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/microsoft/CvT""" - import argparse import json from collections import OrderedDict diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 23728c770c..7963824444 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch CvT model.""" - +"""PyTorch CvT model.""" import collections.abc from dataclasses import dataclass diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py index 03df5033b0..617fc99733 100644 --- a/src/transformers/models/cvt/modeling_tf_cvt.py +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Cvt model.""" - +"""TF 2.0 Cvt model.""" from __future__ import annotations diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index 32d505f157..54754a8c79 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2VecText configuration""" +"""Data2VecText configuration""" import math diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index e7b15270ed..6cd7b80c30 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Data2VecText configuration""" +"""Data2VecText configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/data2vec/configuration_data2vec_vision.py b/src/transformers/models/data2vec/configuration_data2vec_vision.py index 315f24a557..d63a564cec 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_vision.py +++ b/src/transformers/models/data2vec/configuration_data2vec_vision.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Data2VecVision model configuration""" +"""Data2VecVision model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py index 01c2d8cab2..b5a30223bc 100644 --- a/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Wav2Vec2 checkpoint.""" - import argparse import os from functools import reduce diff --git a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py index 81f5cd23fb..10b97dc93d 100644 --- a/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_text_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert data2vec checkpoint.""" - import argparse import os import pathlib diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 8837c278c5..8be8b5ea8f 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Data2VecAudio model.""" +"""PyTorch Data2VecAudio model.""" import math import warnings diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 12d62b78a0..03b8170e67 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Data2VecVision model.""" - +"""PyTorch Data2VecVision model.""" import collections.abc import math diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index e65a61fae5..3939afe570 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Data2Vec Vision model.""" - +"""TF 2.0 Data2Vec Vision model.""" from __future__ import annotations diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index b03d2c17b0..91f4fc3a4b 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DBRX model configuration """ +"""DBRX model configuration""" from typing import Any, Optional diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index f483f79cd5..850e3a3f81 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DBRX model. """ +"""PyTorch DBRX model.""" import math from typing import Any, Optional, Tuple, Union diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py index e79e7238ab..59b59764c3 100644 --- a/src/transformers/models/deberta/configuration_deberta.py +++ b/src/transformers/models/deberta/configuration_deberta.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DeBERTa model configuration""" +"""DeBERTa model configuration""" + from collections import OrderedDict from typing import TYPE_CHECKING, Any, Mapping, Optional, Union diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 03646b95aa..964e3add91 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeBERTa model.""" +"""PyTorch DeBERTa model.""" from collections.abc import Sequence from typing import Optional, Tuple, Union diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index 774d6296d0..6762c69ec5 100644 --- a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 DeBERTa model.""" - +"""TF 2.0 DeBERTa model.""" from __future__ import annotations diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index b846a78915..371aa98662 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model DeBERTa.""" +"""Tokenization class for model DeBERTa.""" import json import os diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index 07226443d3..b28732850b 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fast Tokenization class for model DeBERTa.""" +"""Fast Tokenization class for model DeBERTa.""" import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py index 78fdd91c81..83745980fb 100644 --- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py +++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DeBERTa-v2 model configuration""" +"""DeBERTa-v2 model configuration""" + from collections import OrderedDict from typing import TYPE_CHECKING, Any, Mapping, Optional, Union diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 572ee9196f..fd910e9daf 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeBERTa-v2 model.""" +"""PyTorch DeBERTa-v2 model.""" from collections.abc import Sequence from typing import Optional, Tuple, Union diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 9bd6258100..15ab6da158 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 DeBERTa-v2 model.""" +"""TF 2.0 DeBERTa-v2 model.""" from __future__ import annotations diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index a921039454..2876ac7660 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model DeBERTa.""" +"""Tokenization class for model DeBERTa.""" import os import unicodedata diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py index 6f1fb500ba..19e89afecb 100644 --- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py +++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Decision Transformer model configuration""" +"""Decision Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index be7928995c..236efb1d22 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DecisionTransformer model.""" +"""PyTorch DecisionTransformer model.""" import math import os diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index b623d3504a..d888f6e6f4 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Deformable DETR model configuration""" +"""Deformable DETR model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py index 928fa368ed..b637ba6d84 100644 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert Deformable DETR checkpoints.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/deformable_detr/load_custom.py b/src/transformers/models/deformable_detr/load_custom.py index c3a822e276..3c0b3a432b 100644 --- a/src/transformers/models/deformable_detr/load_custom.py +++ b/src/transformers/models/deformable_detr/load_custom.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Loading of Deformable DETR's CUDA kernels""" +"""Loading of Deformable DETR's CUDA kernels""" + import os from pathlib import Path diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index b77ae4f8f8..f619575bd8 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Deformable DETR model.""" - +"""PyTorch Deformable DETR model.""" import copy import math diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py index e1767c35fd..3784ed76ab 100644 --- a/src/transformers/models/deit/configuration_deit.py +++ b/src/transformers/models/deit/configuration_deit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DeiT model configuration""" +"""DeiT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py index 2b5c795ff2..e7bf3e7a12 100644 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DeiT distilled checkpoints from the timm library.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index fe811ecc4a..c9e54d3b87 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DeiT model.""" - +"""PyTorch DeiT model.""" import collections.abc import math diff --git a/src/transformers/models/deit/modeling_tf_deit.py b/src/transformers/models/deit/modeling_tf_deit.py index 43a3465ba1..e5faff2a4a 100644 --- a/src/transformers/models/deit/modeling_tf_deit.py +++ b/src/transformers/models/deit/modeling_tf_deit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TensorFlow DeiT model.""" - +"""TensorFlow DeiT model.""" from __future__ import annotations diff --git a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py index 5dc9a244c4..e2f64e9c3c 100644 --- a/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Bort checkpoint.""" - import argparse import os diff --git a/src/transformers/models/deprecated/mctct/modeling_mctct.py b/src/transformers/models/deprecated/mctct/modeling_mctct.py index 95c860fa9a..931495611a 100755 --- a/src/transformers/models/deprecated/mctct/modeling_mctct.py +++ b/src/transformers/models/deprecated/mctct/modeling_mctct.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch M-CTC-T model.""" - +"""PyTorch M-CTC-T model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/deprecated/mctct/processing_mctct.py b/src/transformers/models/deprecated/mctct/processing_mctct.py index 4e0cbe27dd..e2201c0ed5 100644 --- a/src/transformers/models/deprecated/mctct/processing_mctct.py +++ b/src/transformers/models/deprecated/mctct/processing_mctct.py @@ -15,6 +15,7 @@ """ Speech processor class for M-CTC-T """ + import warnings from contextlib import contextmanager diff --git a/src/transformers/models/deprecated/mmbt/configuration_mmbt.py b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py index df5161b092..8fcc0f1d63 100644 --- a/src/transformers/models/deprecated/mmbt/configuration_mmbt.py +++ b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MMBT configuration""" +"""MMBT configuration""" from ....utils import logging diff --git a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py index 8dc450ce8f..4a06de5698 100644 --- a/src/transformers/models/deprecated/mmbt/modeling_mmbt.py +++ b/src/transformers/models/deprecated/mmbt/modeling_mmbt.py @@ -15,7 +15,6 @@ # limitations under the License. """PyTorch MMBT model.""" - import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py index ae2add5a5f..259fd19367 100644 --- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Open-Llama model configuration""" +"""Open-Llama model configuration""" from ....configuration_utils import PretrainedConfig from ....utils import logging diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 4e42d716e8..e748529c9e 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Open-Llama model.""" +"""PyTorch Open-Llama model.""" + import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/deprecated/retribert/configuration_retribert.py b/src/transformers/models/deprecated/retribert/configuration_retribert.py index dfa7d3b65b..f154bb04c6 100644 --- a/src/transformers/models/deprecated/retribert/configuration_retribert.py +++ b/src/transformers/models/deprecated/retribert/configuration_retribert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RetriBERT model configuration""" +"""RetriBERT model configuration""" from ....configuration_utils import PretrainedConfig from ....utils import logging diff --git a/src/transformers/models/deprecated/retribert/modeling_retribert.py b/src/transformers/models/deprecated/retribert/modeling_retribert.py index d8af23fb49..3af3f7be49 100644 --- a/src/transformers/models/deprecated/retribert/modeling_retribert.py +++ b/src/transformers/models/deprecated/retribert/modeling_retribert.py @@ -16,7 +16,6 @@ RetriBERT model """ - import math from typing import Optional diff --git a/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py b/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py index 06ec12161f..6ce86dfb7a 100644 --- a/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py +++ b/src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TrajectoryTransformer model configuration""" +"""TrajectoryTransformer model configuration""" from ....configuration_utils import PretrainedConfig from ....utils import logging diff --git a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py index 622552fa78..da7f780667 100644 --- a/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" TrajectoryTransformer pytorch checkpoint conversion""" +"""TrajectoryTransformer pytorch checkpoint conversion""" import torch import trajectory.utils as utils diff --git a/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py index 24e1815218..5bb787b87d 100644 --- a/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch TrajectoryTransformer model.""" +"""PyTorch TrajectoryTransformer model.""" import math import os diff --git a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py index c9dde2e8fd..5cd031649f 100644 --- a/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Transformer XL configuration""" +"""Transformer XL configuration""" from ....configuration_utils import PretrainedConfig from ....utils import logging diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index d2693ac333..cf191f797f 100644 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Transformer XL checkpoint and datasets.""" - import argparse import os import pickle diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py index e490eb8dba..2eb3fe4893 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - TF 2.0 Transformer XL model. +TF 2.0 Transformer XL model. """ from __future__ import annotations diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py index ed1488d559..48205e06fb 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - A TF 2.0 Adaptive Softmax for Transformer XL model. +A TF 2.0 Adaptive Softmax for Transformer XL model. 
""" - import tensorflow as tf from ....modeling_tf_utils import keras diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py index 52989cfd13..84b60dbf6d 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular - https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular +https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py """ + import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py index addf2a0837..f76f3ccc62 100644 --- a/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py +++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl. +Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl. """ - import torch from torch import nn diff --git a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py index 7290a7a83b..4229e8e5b3 100644 --- a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. +Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. """ - import glob import os import pickle diff --git a/src/transformers/models/deprecated/van/configuration_van.py b/src/transformers/models/deprecated/van/configuration_van.py index 68a139ffdf..2935e631e0 100644 --- a/src/transformers/models/deprecated/van/configuration_van.py +++ b/src/transformers/models/deprecated/van/configuration_van.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" VAN model configuration""" +"""VAN model configuration""" from ....configuration_utils import PretrainedConfig from ....utils import logging diff --git a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py index 20492e42be..51466e77ba 100644 --- a/src/transformers/models/deprecated/van/convert_van_to_pytorch.py +++ b/src/transformers/models/deprecated/van/convert_van_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/Visual-Attention-Network/VAN-Classification""" - import argparse import json import sys diff --git a/src/transformers/models/deprecated/van/modeling_van.py b/src/transformers/models/deprecated/van/modeling_van.py index 0f5940707c..1b26d8892b 100644 --- a/src/transformers/models/deprecated/van/modeling_van.py +++ b/src/transformers/models/deprecated/van/modeling_van.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Visual Attention Network (VAN) model.""" +"""PyTorch Visual Attention Network (VAN) model.""" import math from collections import OrderedDict diff --git a/src/transformers/models/depth_anything/configuration_depth_anything.py b/src/transformers/models/depth_anything/configuration_depth_anything.py index b6d6f388b0..9f38434abc 100644 --- a/src/transformers/models/depth_anything/configuration_depth_anything.py +++ b/src/transformers/models/depth_anything/configuration_depth_anything.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DepthAnything model configuration""" +"""DepthAnything model configuration""" import copy diff --git a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py index 022a66c0d6..9b9836e852 100644 --- a/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py +++ b/src/transformers/models/depth_anything/convert_depth_anything_to_hf.py @@ -15,7 +15,6 @@ """Convert Depth Anything checkpoints from the original repository. URL: https://github.com/LiheYoung/Depth-Anything""" - import argparse from pathlib import Path diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 9780278a6f..3fb2b32f79 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Depth Anything model.""" - +"""PyTorch Depth Anything model.""" from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index b876e843c0..d7fe7eadc7 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" DETA model configuration""" - +"""DETA model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py index cc17568bd6..870c56f838 100644 --- a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/jozhang97/DETA/tree/master""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py index 911bc434e1..67052edce1 100644 --- a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/jozhang97/DETA/tree/master""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index bd2faad271..fcd8fd82b6 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DETA model.""" - +"""PyTorch DETA model.""" import copy import math diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 094859bda5..5e8c3b1fd8 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DETR model configuration""" +"""DETR model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py index 72de2be870..ba98514501 100644 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DETR checkpoints with timm backbone.""" - import argparse import json from collections import OrderedDict diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py index a52e592b94..6ba6a0e292 100644 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DETR checkpoints with native (Transformers) backbone.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 37da6b6ee5..ff8b1416b0 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DETR model.""" - +"""PyTorch DETR model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 6138e8072b..220561152b 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Dilated Neighborhood Attention Transformer model configuration""" +"""Dilated Neighborhood Attention Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 03c0fe33a2..95d6a60183 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Dilated Neighborhood Attention Transformer model.""" - +"""PyTorch Dilated Neighborhood Attention Transformer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/dinov2/configuration_dinov2.py b/src/transformers/models/dinov2/configuration_dinov2.py index 48feba23b1..2df883de16 100644 --- a/src/transformers/models/dinov2/configuration_dinov2.py +++ b/src/transformers/models/dinov2/configuration_dinov2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DINOv2 model configuration""" +"""DINOv2 model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/dinov2/convert_dinov2_to_hf.py b/src/transformers/models/dinov2/convert_dinov2_to_hf.py index dd5871e6c4..5583413eb7 100644 --- a/src/transformers/models/dinov2/convert_dinov2_to_hf.py +++ b/src/transformers/models/dinov2/convert_dinov2_to_hf.py @@ -17,7 +17,6 @@ URL: https://github.com/facebookresearch/dinov2/tree/main """ - import argparse import json from pathlib import Path diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index 505c1b5c06..3a7959c27d 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DINOv2 model.""" - +"""PyTorch DINOv2 model.""" import collections.abc import math diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py index 45e2d3b3f0..a2ce1a2419 100644 --- a/src/transformers/models/distilbert/configuration_distilbert.py +++ b/src/transformers/models/distilbert/configuration_distilbert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" DistilBERT model configuration""" +"""DistilBERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index e8357682ff..8c65a4b215 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -14,11 +14,10 @@ # limitations under the License. """ - PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in - part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +PyTorch DistilBERT model adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) and in +part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) """ - import math from typing import Dict, List, Optional, Set, Tuple, Union diff --git a/src/transformers/models/distilbert/modeling_tf_distilbert.py b/src/transformers/models/distilbert/modeling_tf_distilbert.py index 1e663c9229..87dab93ca1 100644 --- a/src/transformers/models/distilbert/modeling_tf_distilbert.py +++ b/src/transformers/models/distilbert/modeling_tf_distilbert.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - TF 2.0 DistilBERT model +TF 2.0 DistilBERT model """ - from __future__ import annotations import warnings diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py index c754b9bbf3..40c5b22e3b 100644 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DiT checkpoints from the unilm repository.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index 80418e7144..b9f9fae39c 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Donut Swin Transformer model configuration""" +"""Donut Swin Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 13f669ad97..913bf2b64b 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -106,22 +106,22 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" - ] = val[dim : dim * 2, :] + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( + val[dim : dim * 2, :] + ) orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" - ] = val[:dim] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" - ] = val[-dim:] + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( + val[:dim] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( + val[dim : dim * 2] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( + val[-dim:] + ) elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: # HuggingFace implementation doesn't use attn_mask buffer # and model doesn't use final LayerNorms for the encoder diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 5b8a0e27b0..4775d00c19 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Donut Swin Transformer model. +"""PyTorch Donut Swin Transformer model. This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden states.""" diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 1f03fd6306..daf6e7d1df 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -15,6 +15,7 @@ """ Processor class for Donut. """ + import re import warnings from contextlib import contextmanager diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py index 3d7abec3eb..b22da23ca4 100644 --- a/src/transformers/models/dpr/configuration_dpr.py +++ b/src/transformers/models/dpr/configuration_dpr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" DPR model configuration""" +"""DPR model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 2094440a7f..7ba63f134c 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DPR model for Open Domain Question Answering.""" - +"""PyTorch DPR model for Open Domain Question Answering.""" from dataclasses import dataclass from typing import Optional, Tuple, Union diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index a2e539f73f..92a0e54cbb 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow DPR model for Open Domain Question Answering.""" +"""TensorFlow DPR model for Open Domain Question Answering.""" from __future__ import annotations diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index 1362047ce2..45ce73425f 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for DPR.""" - import collections from typing import List, Optional, Union diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 730f200a68..69ac58a77d 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for DPR.""" - import collections from typing import List, Optional, Union diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index b21864e9b0..7ae48f7519 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" DPT model configuration""" +"""DPT model configuration""" import copy diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py index 2bd147096c..7b3715bddf 100644 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py @@ -15,7 +15,6 @@ """Convert DINOv2 + DPT checkpoints from the original repository. URL: https://github.com/facebookresearch/dinov2/tree/main""" - import argparse import itertools import math diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py index eb335a0ea2..3a576d772f 100644 --- a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py +++ b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DPT 3.1 checkpoints from the MiDaS repository. 
URL: https://github.com/isl-org/MiDaS""" - import argparse from pathlib import Path diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index 0fa69adfaf..1304acaafc 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py index fd6522ab6b..0feebe72d4 100644 --- a/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py +++ b/src/transformers/models/dpt/convert_dpt_swinv2_to_hf.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS""" - import argparse from pathlib import Path diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py index 42637cb215..b55c96f0c7 100644 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert DPT checkpoints from the original repository. URL: https://github.com/isl-org/DPT""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 5240b651cc..a15c9caca2 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -12,14 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch DPT (Dense Prediction Transformers) model. +"""PyTorch DPT (Dense Prediction Transformers) model. This implementation is heavily inspired by OpenMMLab's implementation, found here: https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py. """ - import collections.abc import math from dataclasses import dataclass diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/efficientformer/configuration_efficientformer.py index 0cc7635ea0..a9fbfa7e05 100644 --- a/src/transformers/models/efficientformer/configuration_efficientformer.py +++ b/src/transformers/models/efficientformer/configuration_efficientformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" EfficientFormer model configuration""" +"""EfficientFormer model configuration""" from typing import List diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/efficientformer/modeling_efficientformer.py index 6d79fe5bae..44d2adbed4 100644 --- a/src/transformers/models/efficientformer/modeling_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_efficientformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch EfficientFormer model.""" +"""PyTorch EfficientFormer model.""" import itertools from dataclasses import dataclass diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py index 605487e6fe..a8ce9dd306 100644 --- a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py +++ b/src/transformers/models/efficientformer/modeling_tf_efficientformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow EfficientFormer model.""" +"""TensorFlow EfficientFormer model.""" import itertools from dataclasses import dataclass diff --git a/src/transformers/models/efficientnet/configuration_efficientnet.py b/src/transformers/models/efficientnet/configuration_efficientnet.py index 63480a9a6d..4c7feb377f 100644 --- a/src/transformers/models/efficientnet/configuration_efficientnet.py +++ b/src/transformers/models/efficientnet/configuration_efficientnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" EfficientNet model configuration""" +"""EfficientNet model configuration""" from collections import OrderedDict from typing import List, Mapping diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index b047e4f89f..057cd42f2a 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch EfficientNet model.""" - +"""PyTorch EfficientNet model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py index f4ca304983..17be728ed6 100644 --- a/src/transformers/models/electra/configuration_electra.py +++ b/src/transformers/models/electra/configuration_electra.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ELECTRA model configuration""" +"""ELECTRA model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py index d5d6376d7b..b0abc30cd7 100644 --- a/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/electra/convert_electra_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert ELECTRA checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index b903e7f082..a289bb9728 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF Electra model.""" - +"""TF Electra model.""" from __future__ import annotations diff --git a/src/transformers/models/encodec/configuration_encodec.py b/src/transformers/models/encodec/configuration_encodec.py index 4d8611a178..bc10e8ffc3 100644 --- a/src/transformers/models/encodec/configuration_encodec.py +++ b/src/transformers/models/encodec/configuration_encodec.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" EnCodec model configuration""" - +"""EnCodec model configuration""" import math from typing import Optional diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py index 0df2a2a6ca..9627742b9e 100644 --- a/src/transformers/models/encodec/modeling_encodec.py +++ b/src/transformers/models/encodec/modeling_encodec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch EnCodec model.""" +"""PyTorch EnCodec model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 2b185cc14a..b568850060 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Encoder-Decoder architectures""" - +"""Classes to support Encoder-Decoder architectures""" import gc import inspect diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py index beecd08032..24b053969c 100644 --- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Classes to support Flax Encoder-Decoder architectures""" - +"""Classes to support Flax Encoder-Decoder architectures""" import os from typing import Optional, Tuple, Union diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index 855fb767d1..85802b77f3 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support TF Encoder-Decoder architectures""" - +"""Classes to support TF Encoder-Decoder architectures""" from __future__ import annotations diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 16f1448990..808a0c2722 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ERNIE model configuration""" +"""ERNIE model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 4c2dc767f8..298465b6c9 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch ERNIE model.""" - import math import warnings from dataclasses import dataclass diff --git a/src/transformers/models/ernie_m/configuration_ernie_m.py b/src/transformers/models/ernie_m/configuration_ernie_m.py index 9c67a86421..cf34a510f2 100644 --- a/src/transformers/models/ernie_m/configuration_ernie_m.py +++ b/src/transformers/models/ernie_m/configuration_ernie_m.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ErnieM model configuration""" +"""ErnieM model configuration""" # Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py) from __future__ import annotations diff --git a/src/transformers/models/ernie_m/modeling_ernie_m.py b/src/transformers/models/ernie_m/modeling_ernie_m.py index b8f7e87c85..6b977801fe 100755 --- a/src/transformers/models/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/ernie_m/modeling_ernie_m.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch ErnieM model.""" - +"""PyTorch ErnieM model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index feb06031e5..9634a20015 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ESM model configuration""" +"""ESM model configuration""" from dataclasses import asdict, dataclass from typing import Optional diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py index 22ca3f5392..020dd4e576 100644 --- a/src/transformers/models/esm/convert_esm.py +++ b/src/transformers/models/esm/convert_esm.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ESM checkpoint.""" - import argparse import pathlib from pathlib import Path diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index e5683a3c99..08819b7f77 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ESM model.""" +"""PyTorch ESM model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/esm/modeling_tf_esm.py b/src/transformers/models/esm/modeling_tf_esm.py index 2688c207b0..7cb673103d 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ESM model.""" - +"""PyTorch ESM model.""" from __future__ import annotations diff --git a/src/transformers/models/esm/openfold_utils/feats.py b/src/transformers/models/esm/openfold_utils/feats.py index 18b01a1fec..ac7b90dfe7 100644 --- a/src/transformers/models/esm/openfold_utils/feats.py +++ b/src/transformers/models/esm/openfold_utils/feats.py @@ -25,15 +25,13 @@ from .tensor_utils import batched_gather @overload -def pseudo_beta_fn(aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: None) -> torch.Tensor: - ... +def pseudo_beta_fn(aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: None) -> torch.Tensor: ... @overload def pseudo_beta_fn( aatype: torch.Tensor, all_atom_positions: torch.Tensor, all_atom_masks: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor]: - ... +) -> Tuple[torch.Tensor, torch.Tensor]: ... def pseudo_beta_fn(aatype, all_atom_positions, all_atom_masks): diff --git a/src/transformers/models/esm/openfold_utils/protein.py b/src/transformers/models/esm/openfold_utils/protein.py index 32e0157171..ae9d8c1327 100644 --- a/src/transformers/models/esm/openfold_utils/protein.py +++ b/src/transformers/models/esm/openfold_utils/protein.py @@ -14,6 +14,7 @@ # limitations under the License. 
"""Protein data type.""" + import dataclasses import re import string diff --git a/src/transformers/models/esm/openfold_utils/tensor_utils.py b/src/transformers/models/esm/openfold_utils/tensor_utils.py index 99dd6dbe47..20ee34b236 100644 --- a/src/transformers/models/esm/openfold_utils/tensor_utils.py +++ b/src/transformers/models/esm/openfold_utils/tensor_utils.py @@ -108,23 +108,19 @@ def dict_map( @overload -def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: Type[T]) -> Any: - ... +def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: Type[T]) -> Any: ... @overload -def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: Type[T]) -> dict: - ... +def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: Type[T]) -> dict: ... @overload -def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: Type[T]) -> list: - ... +def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: Type[T]) -> list: ... @overload -def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: Type[T]) -> tuple: - ... +def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: Type[T]) -> tuple: ... def tree_map(fn, tree, leaf_type): diff --git a/src/transformers/models/esm/tokenization_esm.py b/src/transformers/models/esm/tokenization_esm.py index 27a889c87e..fbb759c1d1 100644 --- a/src/transformers/models/esm/tokenization_esm.py +++ b/src/transformers/models/esm/tokenization_esm.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for ESM.""" + import os from typing import List, Optional diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index b8e1ae8512..ade5b8b266 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" FastSpeech2Conformer model configuration""" +"""FastSpeech2Conformer model configuration""" from typing import Dict diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py index 01c2cece55..e97e276b18 100644 --- a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch FastSpeech2Conformer model.""" +"""PyTorch FastSpeech2Conformer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py index 5b979c8761..65c081c4c1 100644 --- a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for FastSpeech2Conformer.""" + import json import os from typing import Optional, Tuple diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index 98939e7091..ae5e07245e 100644 --- a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flaubert configuration""" +"""Flaubert configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 86db691557..50c6f7ede2 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Flaubert model, based on XLM.""" +"""PyTorch Flaubert model, based on XLM.""" import itertools import math diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index 0538fa061c..71e371da24 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - TF 2.0 Flaubert model. +TF 2.0 Flaubert model. """ - from __future__ import annotations import itertools diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 20f9926422..d8598a5a99 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for Flaubert.""" - import json import os import re diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index b78b5fdeac..9a6da69193 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" FLAVA model configurations""" +"""FLAVA model configurations""" import os from typing import Any, Dict, Union diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 661bfb2dce..5acbad05c3 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch FLAVA model.""" +"""PyTorch FLAVA model.""" import collections import math diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py index 18ad618799..90b77fc5d7 100644 --- a/src/transformers/models/fnet/configuration_fnet.py +++ b/src/transformers/models/fnet/configuration_fnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" FNet model configuration""" +"""FNet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py index f77a44874a..71660354db 100644 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert FNet checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index a29a6f8016..8221af6d76 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch FNet model.""" +"""PyTorch FNet model.""" import warnings from dataclasses import dataclass diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py index a38114eb6d..29095c80ff 100644 --- a/src/transformers/models/fnet/tokenization_fnet.py +++ b/src/transformers/models/fnet/tokenization_fnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for FNet model.""" +"""Tokenization classes for FNet model.""" import os import unicodedata diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py index f279ad9ca7..3136b9f27c 100644 --- a/src/transformers/models/fnet/tokenization_fnet_fast.py +++ b/src/transformers/models/fnet/tokenization_fnet_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Tokenization classes for FNet model.""" - +"""Tokenization classes for FNet model.""" import os from shutil import copyfile diff --git a/src/transformers/models/focalnet/configuration_focalnet.py b/src/transformers/models/focalnet/configuration_focalnet.py index 1b2cca8b48..577530e2ec 100644 --- a/src/transformers/models/focalnet/configuration_focalnet.py +++ b/src/transformers/models/focalnet/configuration_focalnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" FocalNet model configuration""" +"""FocalNet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 5958eb6a0e..99f2dc658f 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch FocalNet model.""" - +"""PyTorch FocalNet model.""" import collections.abc import math diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index 7ed34a6792..72af4ddab2 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" FSMT configuration""" - +"""FSMT configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 0df585ed46..d1f1ee4cac 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for FSMT.""" - import json import os import re diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index c3d6ff9ee4..53d072d4c8 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Funnel Transformer model configuration""" +"""Funnel Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py index 848101f083..4eab188f2a 100755 --- a/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/funnel/convert_funnel_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert Funnel checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 1e7eaee161..b4fdfd5fc5 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Funnel Transformer model.""" +"""PyTorch Funnel Transformer model.""" import os from dataclasses import dataclass diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index dea3ad550c..ab5f14a4c6 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Funnel model.""" - +"""TF 2.0 Funnel model.""" from __future__ import annotations diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index a1580deccf..6a710d660c 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for Funnel Transformer.""" +"""Tokenization class for Funnel Transformer.""" import collections import os diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py index 9ff2a3bfef..6a48f2f54a 100644 --- a/src/transformers/models/funnel/tokenization_funnel_fast.py +++ b/src/transformers/models/funnel/tokenization_funnel_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for Funnel Transformer.""" +"""Tokenization class for Funnel Transformer.""" import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 8a5013a651..6cf666d7ee 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Fuyu model configuration""" +"""Fuyu model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index bdaec5f868..e716e9f334 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Fuyu model.""" +"""PyTorch Fuyu model.""" + from typing import List, Optional, Tuple, Union import torch diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index ffa215f1a0..2e46cabfa3 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -15,6 +15,7 @@ """ Image/Text processor class for GIT """ + import re from typing import Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index c8a5504516..3bf296a63b 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Gemma model configuration""" +"""Gemma model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/gemma/modeling_flax_gemma.py b/src/transformers/models/gemma/modeling_flax_gemma.py index 235f65680f..16291f3c3a 100644 --- a/src/transformers/models/gemma/modeling_flax_gemma.py +++ b/src/transformers/models/gemma/modeling_flax_gemma.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Flax Gemma model.""" + from typing import Optional, Tuple import flax.linen as nn diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 6003c91cee..565a976fd7 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Gemma model.""" +"""PyTorch Gemma model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py index ab19ee23c7..f70c6e807e 100644 --- a/src/transformers/models/gemma/tokenization_gemma.py +++ b/src/transformers/models/gemma/tokenization_gemma.py @@ -14,6 +14,7 @@ # limitations under the License. """Tokenization classes for Gemma.""" + import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py index 4e3e8e7b31..238b8124a0 100644 --- a/src/transformers/models/git/convert_git_to_pytorch.py +++ b/src/transformers/models/git/convert_git_to_pytorch.py @@ -16,7 +16,6 @@ URL: https://github.com/microsoft/GenerativeImage2Text/tree/main""" - import argparse from pathlib import Path diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 87b0cb8073..8e14e3a899 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -15,7 +15,6 @@ # limitations under the License. 
"""PyTorch GIT model.""" - import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/glpn/configuration_glpn.py b/src/transformers/models/glpn/configuration_glpn.py index e1e4b53505..88e1d6e1f0 100644 --- a/src/transformers/models/glpn/configuration_glpn.py +++ b/src/transformers/models/glpn/configuration_glpn.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GLPN model configuration""" +"""GLPN model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py index 5f0183783e..e19ee93819 100644 --- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py +++ b/src/transformers/models/glpn/convert_glpn_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert GLPN checkpoints.""" - import argparse from collections import OrderedDict from pathlib import Path diff --git a/src/transformers/models/glpn/modeling_glpn.py b/src/transformers/models/glpn/modeling_glpn.py index a9a8bc9b36..9fd22ca0f7 100755 --- a/src/transformers/models/glpn/modeling_glpn.py +++ b/src/transformers/models/glpn/modeling_glpn.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GLPN model.""" - +"""PyTorch GLPN model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index 249decbbaa..82a2491295 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" OpenAI GPT-2 configuration""" +"""OpenAI GPT-2 configuration""" + from collections import OrderedDict from typing import Any, List, Mapping, Optional diff --git a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py index 066ba06503..33f9dabed0 100755 --- a/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/gpt2/convert_gpt2_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert OpenAI GPT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py index c6f5883e22..379cbbfaf1 100644 --- a/src/transformers/models/gpt2/modeling_tf_gpt2.py +++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 OpenAI GPT-2 model.""" +"""TF 2.0 OpenAI GPT-2 model.""" from __future__ import annotations diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 9e81b4473e..9bca559d9e 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI GPT.""" - import json import os from functools import lru_cache diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index 39e59298c8..e6747119f4 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI GPT.""" - import json from typing import Optional, Tuple diff --git a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py index 0c295e17a1..5bd72d23f9 100644 --- a/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/configuration_gpt_bigcode.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GPTBigCode configuration""" +"""GPTBigCode configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 7b7bfaf1d4..e03a1a3394 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """PyTorch GPTBigCode model.""" + import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py index 66c3b6812d..a3c261e855 100644 --- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py +++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GPT Neo model configuration""" +"""GPT Neo model configuration""" from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py index 4a5fddd0a9..3db2285729 100644 --- a/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py +++ b/src/transformers/models/gpt_neo/convert_gpt_neo_mesh_tf_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert GPT Neo checkpoint.""" - import argparse import json diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index b2891526b3..694c0bc88b 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPT Neo model.""" - +"""PyTorch GPT Neo model.""" import os from typing import Optional, Tuple, Union diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index d559148a72..8e4c94692e 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GPTNeoX model configuration""" +"""GPTNeoX model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 4980f7c636..ba2fb8aa76 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPTNeoX model.""" +"""PyTorch GPTNeoX model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index ba54a1dc7d..2504fa3cc0 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for GPTNeoX.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index bde828790d..d3c18a3643 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" GPTNeoX Japanese model configuration""" +"""GPTNeoX Japanese model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 24d2113178..b9c4cad0fd 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPTNeoX model.""" +"""PyTorch GPTNeoX model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index b761e539a5..f36f7e3fd6 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for GPTNeoXJapanese.""" + import collections import json import os diff --git a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py index 5562efa287..2625701c1a 100644 --- a/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py +++ b/src/transformers/models/gpt_sw3/convert_megatron_to_pytorch.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Convert GPT-SW3 megatron checkpoints to pytorch""" +"""Convert GPT-SW3 megatron checkpoints to pytorch""" import argparse import os diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py index c9c6c4a1c5..1b93f259b0 100644 --- a/src/transformers/models/gptj/configuration_gptj.py +++ b/src/transformers/models/gptj/configuration_gptj.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GPT-J model configuration""" +"""GPT-J model configuration""" + from collections import OrderedDict from typing import Any, List, Mapping, Optional diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 5f1e1d6612..96f4197a87 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch GPT-J model.""" +"""PyTorch GPT-J model.""" import warnings from typing import Optional, Tuple, Union diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 5c315b5b66..b20512b19d 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 GPT-J model.""" +"""TF 2.0 GPT-J model.""" from __future__ import annotations diff --git a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py index 7630d67bff..23295f3510 100644 --- a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GPTSAN-japanese model configuration""" +"""GPTSAN-japanese model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py index f897a26483..7faafd9efb 100644 --- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch GPTSANJapanese model.""" - +"""PyTorch GPTSANJapanese model.""" import copy from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py index 627f7a9b28..b21b8a6f23 100644 --- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for GPTSANJapanese.""" + import collections import json import os diff --git a/src/transformers/models/graphormer/configuration_graphormer.py b/src/transformers/models/graphormer/configuration_graphormer.py index 7aaba3aa76..9f6904ef38 100644 --- a/src/transformers/models/graphormer/configuration_graphormer.py +++ b/src/transformers/models/graphormer/configuration_graphormer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Graphormer model configuration""" +"""Graphormer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/graphormer/modeling_graphormer.py index 45ffb91a91..f2696a586b 100755 --- a/src/transformers/models/graphormer/modeling_graphormer.py +++ b/src/transformers/models/graphormer/modeling_graphormer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Graphormer model.""" +"""PyTorch Graphormer model.""" import math from typing import Iterable, Iterator, List, Optional, Tuple, Union diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py index 177b4bab7e..4c70abf5bd 100644 --- a/src/transformers/models/grounding_dino/configuration_grounding_dino.py +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Grounding DINO model configuration""" +"""Grounding DINO model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index 72fc14edeb..1afe3ad44c 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Grounding DINO model.""" +"""PyTorch Grounding DINO model.""" import copy import math diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 1f17a0a7e6..4051fd069d 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" GroupViT model configuration""" +"""GroupViT model configuration""" import os from collections import OrderedDict diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index aa8e803357..99be160319 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch GroupViT model.""" - +"""PyTorch GroupViT model.""" import collections.abc import math diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index 78f7f793c3..f06c5f57f8 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 GroupViT model.""" - +"""TF 2.0 GroupViT model.""" from __future__ import annotations diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 293b9f76f2..20977cff87 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Hubert model configuration""" +"""Hubert model configuration""" import functools import operator diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py index 571761e022..f5914f35c5 100644 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Hubert checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py index 9a70fb6db7..6478fdadf1 100644 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Hubert checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py index 51908f9302..ff15b90088 100644 --- a/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_hubert_original_s3prl_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Hubert checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 83d90f5ded..388d16415c 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Hubert model.""" +"""PyTorch Hubert model.""" import warnings from typing import Optional, Tuple, Union diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 142ab0961d..4c31fc78c2 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow Hubert model.""" +"""TensorFlow Hubert model.""" from __future__ import annotations diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index afa67d6d6a..9af660669d 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -14,7 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" I-BERT configuration""" +"""I-BERT configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 8b61238ed9..e286ef3705 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Idefics model configuration""" +"""Idefics model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 2ed51413d9..6d65825986 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Idefics model.""" +"""PyTorch Idefics model.""" + from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/idefics/modeling_tf_idefics.py b/src/transformers/models/idefics/modeling_tf_idefics.py index 8d9322b0ed..c5ce2935d3 100644 --- a/src/transformers/models/idefics/modeling_tf_idefics.py +++ b/src/transformers/models/idefics/modeling_tf_idefics.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Idefics model. 
""" +"""TF 2.0 Idefics model.""" from __future__ import annotations diff --git a/src/transformers/models/idefics/perceiver.py b/src/transformers/models/idefics/perceiver.py index 888c5b0bb9..91e80f8516 100644 --- a/src/transformers/models/idefics/perceiver.py +++ b/src/transformers/models/idefics/perceiver.py @@ -36,6 +36,7 @@ References: - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch """ + from typing import Optional, Tuple import torch diff --git a/src/transformers/models/idefics/perceiver_tf.py b/src/transformers/models/idefics/perceiver_tf.py index c9e76004a7..0f1c615349 100644 --- a/src/transformers/models/idefics/perceiver_tf.py +++ b/src/transformers/models/idefics/perceiver_tf.py @@ -36,6 +36,7 @@ References: - Code borrowed w/ love from: https://github.com/lucidrains/flamingo-pytorch """ + from typing import Optional, Tuple import tensorflow as tf diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index d90f837b3c..847e92e89c 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" - +"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" import math from dataclasses import dataclass diff --git a/src/transformers/models/idefics/vision_tf.py b/src/transformers/models/idefics/vision_tf.py index 0060bb7ac9..7acfa01939 100644 --- a/src/transformers/models/idefics/vision_tf.py +++ b/src/transformers/models/idefics/vision_tf.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" - +"""TF IdeficsVision model: a copy of CLIPVisionModel using a simpler config object""" import math from dataclasses import dataclass diff --git a/src/transformers/models/imagegpt/configuration_imagegpt.py b/src/transformers/models/imagegpt/configuration_imagegpt.py index 906e61eeef..c54c11491c 100644 --- a/src/transformers/models/imagegpt/configuration_imagegpt.py +++ b/src/transformers/models/imagegpt/configuration_imagegpt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" OpenAI ImageGPT configuration""" +"""OpenAI ImageGPT configuration""" from collections import OrderedDict from typing import TYPE_CHECKING, Any, Mapping, Optional diff --git a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py index 0212bd485b..182d66b9af 100644 --- a/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py +++ b/src/transformers/models/imagegpt/convert_imagegpt_original_tf2_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert OpenAI Image GPT checkpoints.""" - import argparse import torch diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 844bf474fa..6b5507a015 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Informer model.""" +"""PyTorch Informer model.""" from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 23b743ecb5..636d18b990 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" InstructBLIP model configuration""" +"""InstructBLIP model configuration""" import os from typing import Union diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 0fb089b627..386b69cd3b 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch InstructBLIP model.""" +"""PyTorch InstructBLIP model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py index de9cd378bd..6394c74012 100644 --- a/src/transformers/models/jamba/configuration_jamba.py +++ b/src/transformers/models/jamba/configuration_jamba.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Jamba model configuration""" +"""Jamba model configuration""" + import math from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 5a582acdd2..3510b8fb5a 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Jamba model.""" +"""PyTorch Jamba model.""" + import inspect import math from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py index 19203732a9..a2eee03885 100644 --- a/src/transformers/models/jukebox/configuration_jukebox.py +++ b/src/transformers/models/jukebox/configuration_jukebox.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Jukebox configuration""" +"""Jukebox configuration""" import os from typing import List, Union diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py index cd478d6f6b..4952adda64 100644 --- a/src/transformers/models/jukebox/tokenization_jukebox.py +++ b/src/transformers/models/jukebox/tokenization_jukebox.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI Jukebox.""" - import json import os import re diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index f922b60934..fc5f15dcae 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" KOSMOS-2 model configuration""" +"""KOSMOS-2 model configuration""" import os from typing import Union diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 161ebbf95c..9585bd891e 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch KOSMOS-2 model.""" - +"""PyTorch KOSMOS-2 model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py index 88081831bc..4198bb26e9 100644 --- a/src/transformers/models/layoutlm/configuration_layoutlm.py +++ b/src/transformers/models/layoutlm/configuration_layoutlm.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LayoutLM model configuration""" +"""LayoutLM model configuration""" + from collections import OrderedDict from typing import Any, List, Mapping, Optional diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index dbaa1aff81..4a761fcc0d 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LayoutLM model.""" - +"""PyTorch LayoutLM model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 42bcbe00d9..5e95f3a3b5 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 LayoutLM model.""" - +"""TF 2.0 LayoutLM model.""" from __future__ import annotations diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index 836b1aab88..fa6a5f29e9 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model LayoutLM.""" +"""Tokenization class for model LayoutLM.""" import collections import os diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py index fa3d95132b..db1409dfca 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model LayoutLM.""" +"""Tokenization class for model LayoutLM.""" import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py index 6a2eb9ff39..db1fdf7da2 100644 --- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LayoutLMv2 model configuration""" +"""LayoutLMv2 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import is_detectron2_available, logging diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 3e3b7f2b14..70cb1f6e9f 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LayoutLMv2 model.""" +"""PyTorch LayoutLMv2 model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py index 592adaa2f5..aa50a3228e 100644 --- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" LayoutLMv3 model configuration""" +"""LayoutLMv3 model configuration""" from collections import OrderedDict from typing import TYPE_CHECKING, Any, Mapping, Optional diff --git a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py index 7479754f0f..6415f43247 100644 --- a/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.py @@ -14,7 +14,6 @@ # limitations under the License. """TF 2.0 LayoutLMv3 model.""" - from __future__ import annotations import collections diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index b1d885255b..1cbd3f20c2 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -15,6 +15,7 @@ """ Processor class for LayoutXLM. """ + import warnings from typing import List, Optional, Union diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index bbfdf44a1e..3ab57ac892 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for LayoutXLM model.""" - +"""Tokenization classes for LayoutXLM model.""" import os from shutil import copyfile diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index e899d8b22e..bd6533598d 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for LayoutXLM model.""" - +"""Tokenization classes for LayoutXLM model.""" import os from shutil import copyfile diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index 9933ef2253..9ed3b148c7 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LED model configuration""" +"""LED model configuration""" from typing import List, Union diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 982de403c2..41b6c0a2be 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch LED model.""" - +"""PyTorch LED model.""" import math import warnings diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index f64ed7758d..8c414648d6 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 LED model.""" - +"""TF 2.0 LED model.""" from __future__ import annotations diff --git a/src/transformers/models/levit/configuration_levit.py b/src/transformers/models/levit/configuration_levit.py index 197e5fe5ec..5b04930959 100644 --- a/src/transformers/models/levit/configuration_levit.py +++ b/src/transformers/models/levit/configuration_levit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LeViT model configuration""" +"""LeViT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py index 6f285a6de3..afef3f73de 100644 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert LeViT checkpoints from timm.""" - import argparse import json from collections import OrderedDict diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index 0de3789c2c..af202787a1 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LeViT model.""" +"""PyTorch LeViT model.""" import itertools from dataclasses import dataclass diff --git a/src/transformers/models/lilt/configuration_lilt.py b/src/transformers/models/lilt/configuration_lilt.py index fdfa2192a1..57ab8884ed 100644 --- a/src/transformers/models/lilt/configuration_lilt.py +++ b/src/transformers/models/lilt/configuration_lilt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LiLT configuration""" +"""LiLT configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index b406b12fc7..699fd0199f 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" LLaMA model configuration""" +"""LLaMA model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py index 1e7fb5d1a1..1c9f1c4adc 100644 --- a/src/transformers/models/llama/modeling_flax_llama.py +++ b/src/transformers/models/llama/modeling_flax_llama.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Flax LLaMA model.""" + from functools import partial from typing import Optional, Tuple diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 2b4920c668..5392afb763 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -19,6 +19,7 @@ # limitations under the License. """Tokenization classes for LLaMA.""" + import os from shutil import copyfile from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index f146014505..6930dcc78c 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Llava model configuration""" +"""Llava model configuration""" import warnings diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index ff010f7442..7016cd5009 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -16,7 +16,6 @@ Processor class for Llava. """ - from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 21f3d499e3..3111393867 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Llava-NeXT model configuration""" +"""Llava-NeXT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py index 0cafbf5d69..fc60937637 100644 --- a/src/transformers/models/longformer/configuration_longformer.py +++ b/src/transformers/models/longformer/configuration_longformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Longformer configuration""" +"""Longformer configuration""" + from collections import OrderedDict from typing import TYPE_CHECKING, Any, List, Mapping, Optional, Union diff --git a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py index ed7d32ab3e..4ef2131228 100644 --- a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py +++ b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert RoBERTa checkpoint.""" - import argparse import pytorch_lightning as pl diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index be3ccf761b..b32cde202c 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -14,7 +14,6 @@ # limitations under the License. """Tensorflow Longformer model.""" - from __future__ import annotations import warnings diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index 02b74818a2..d4b4228b03 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Tokenization classes for Longformer.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index 839428f27e..0e541ae2a1 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" LongT5 model configuration""" +"""LongT5 model configuration""" + from typing import Mapping from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py index 5a1394c719..cf5c2d52d8 100644 --- a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py +++ b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py @@ -82,9 +82,9 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f # Global input layer norm if config.model_type == "longt5" and config.encoder_attention_type == "transient-global": - flax_model_encoder_layer_block["0"][encoder_attn_name]["global_input_layer_norm"][ - "weight" - ] = t5x_global_layer_norm + flax_model_encoder_layer_block["0"][encoder_attn_name]["global_input_layer_norm"]["weight"] = ( + t5x_global_layer_norm + ) if split_mlp_wi: flax_model_encoder_layer_block["1"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0 diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py index d47f644ba3..4ab18a3ca7 100644 --- a/src/transformers/models/longt5/modeling_flax_longt5.py +++ b/src/transformers/models/longt5/modeling_flax_longt5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax LongT5 model.""" - +"""Flax LongT5 model.""" import copy from typing import Any, Callable, List, Optional, Tuple diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 314785b82c..b2a6ed11ca 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LongT5 model.""" - +"""PyTorch LongT5 model.""" import copy import math diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index d3f14decac..44e1002cfb 100644 --- a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" LUKE configuration""" +"""LUKE configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py index ec3a7c1864..d753e75227 100644 --- a/src/transformers/models/lxmert/configuration_lxmert.py +++ b/src/transformers/models/lxmert/configuration_lxmert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" LXMERT model configuration""" - +"""LXMERT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py index f8eb86f1d1..1dd77bc36f 100755 --- a/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/lxmert/convert_lxmert_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert LXMERT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 6e2ae7d22e..a7f0fea8f4 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch LXMERT model.""" - +"""PyTorch LXMERT model.""" import math import os diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py index 2b51fb7510..8a833fb35a 100644 --- a/src/transformers/models/lxmert/modeling_tf_lxmert.py +++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py @@ -14,8 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 LXMERT model.""" - +"""TF 2.0 LXMERT model.""" from __future__ import annotations diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 73840e5132..7ae3c44127 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" M2M100 model configuration""" +"""M2M100 model configuration""" + from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index 96f79ee4e7..403d8cc507 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for M2M100.""" + import json import os from pathlib import Path diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 5921fde981..5a3f083804 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Marian model configuration""" +"""Marian model configuration""" + from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py index 2002d60caa..e33df2e06b 100644 --- a/src/transformers/models/marian/modeling_flax_marian.py +++ b/src/transformers/models/marian/modeling_flax_marian.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax Marian model.""" +"""Flax Marian model.""" import math import random diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 10d7f1b6b2..2045f67354 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch MarianMTModel model, ported from the Marian C++ repo.""" - import copy import math from typing import Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index c6d5355f70..30c6157d50 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Marian model.""" - +"""TF 2.0 Marian model.""" from __future__ import annotations diff --git a/src/transformers/models/markuplm/configuration_markuplm.py b/src/transformers/models/markuplm/configuration_markuplm.py index 581cc0f349..e348a5c5a1 100644 --- a/src/transformers/models/markuplm/configuration_markuplm.py +++ b/src/transformers/models/markuplm/configuration_markuplm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MarkupLM model configuration""" +"""MarkupLM model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 29f433ae0f..a3aa69621c 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MarkupLM model.""" +"""PyTorch MarkupLM model.""" import math import os diff --git a/src/transformers/models/markuplm/processing_markuplm.py b/src/transformers/models/markuplm/processing_markuplm.py index 81aaca9e5c..757c146c58 100644 --- a/src/transformers/models/markuplm/processing_markuplm.py +++ b/src/transformers/models/markuplm/processing_markuplm.py @@ -15,6 +15,7 @@ """ Processor class for MarkupLM. 
""" + from typing import Optional, Union from ...file_utils import TensorType diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index ed97ed2458..0a49127a52 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mask2Former model configuration""" +"""Mask2Former model configuration""" + from typing import Dict, List, Optional from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index f37b5b14fc..faaca46ed2 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Mask2Former model.""" +"""PyTorch Mask2Former model.""" import math import warnings diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index f82fe199b7..149f3cb52f 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MaskFormer model configuration""" +"""MaskFormer model configuration""" + from typing import Dict, Optional from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index 56d8f746db..1cc2feffbf 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MaskFormer Swin Transformer model configuration""" +"""MaskFormer Swin Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py index fec508de41..873498fa00 100644 --- a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py @@ -15,7 +15,6 @@ """Convert MaskFormer checkpoints with ResNet backbone from the original repository. 
URL: https://github.com/facebookresearch/MaskFormer""" - import argparse import json import pickle diff --git a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py index 8f0d0e99df..8f8441ab8f 100644 --- a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py @@ -15,7 +15,6 @@ """Convert MaskFormer checkpoints with Swin backbone from the original repository. URL: https://github.com/facebookresearch/MaskFormer""" - import argparse import json import pickle diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 2c2746603a..271ad5cc07 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MaskFormer model.""" +"""PyTorch MaskFormer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 4823047dcf..8a4fe14b6c 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MBART model configuration""" +"""MBART model configuration""" + from collections import OrderedDict from typing import Any, Mapping, Optional diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py index 907fd53aa1..0f943df13c 100644 --- a/src/transformers/models/mbart/modeling_flax_mbart.py +++ b/src/transformers/models/mbart/modeling_flax_mbart.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax MBart model.""" +"""Flax MBart model.""" import math import random diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 34eab5b8f3..a7f7be3a85 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MBART model.""" +"""PyTorch MBART model.""" + import copy import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index 2c134b520d..8c9bb98120 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 MBart model.""" - +"""TF 2.0 MBart model.""" from __future__ import annotations diff --git a/src/transformers/models/mega/configuration_mega.py b/src/transformers/models/mega/configuration_mega.py index 8287a3938e..b090a020af 100644 --- a/src/transformers/models/mega/configuration_mega.py +++ b/src/transformers/models/mega/configuration_mega.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MEGA configuration""" +"""MEGA configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py index 2fe75ba273..1f791dab24 100644 --- a/src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/mega/convert_mega_original_pytorch_checkpoint_to_pytorch.py @@ -24,6 +24,7 @@ Requirements: - clone the pretrained weights for the original implementation from the hugging face repo * use this location as the path for pretrained weights """ + import argparse # utilities to import the model weights and config file diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index d3be4db99b..a0e216a535 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MEGATRON_BERT model configuration""" +"""MEGATRON_BERT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 270caf9856..ff0f536396 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MegatronBERT model.""" - +"""PyTorch MegatronBERT model.""" import math import os diff --git a/src/transformers/models/mgp_str/configuration_mgp_str.py b/src/transformers/models/mgp_str/configuration_mgp_str.py index 2ce4ffd0c6..d7850342dc 100644 --- a/src/transformers/models/mgp_str/configuration_mgp_str.py +++ b/src/transformers/models/mgp_str/configuration_mgp_str.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" MGP-STR model configuration""" +"""MGP-STR model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/mgp_str/modeling_mgp_str.py b/src/transformers/models/mgp_str/modeling_mgp_str.py index 6754cebcae..6b18c45e01 100644 --- a/src/transformers/models/mgp_str/modeling_mgp_str.py +++ b/src/transformers/models/mgp_str/modeling_mgp_str.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MGP-STR model.""" +"""PyTorch MGP-STR model.""" import collections.abc from dataclasses import dataclass diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index e281802792..5f3e706339 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mistral model configuration""" +"""Mistral model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/mistral/modeling_flax_mistral.py b/src/transformers/models/mistral/modeling_flax_mistral.py index 3480fc7214..3bff2a6281 100644 --- a/src/transformers/models/mistral/modeling_flax_mistral.py +++ b/src/transformers/models/mistral/modeling_flax_mistral.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax Mistral model.""" +"""Flax Mistral model.""" + from typing import Optional, Tuple import flax.linen as nn diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 5304afd513..d40f516e58 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mixtral model configuration""" +"""Mixtral model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 70e1746392..4c694de0c3 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Mixtral model.""" +"""PyTorch Mixtral model.""" + import inspect import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index 3ef5e64ed2..004f6526f5 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for mLUKE.""" - +"""Tokenization classes for mLUKE.""" import itertools import json diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py index 197e29fcfe..2370fa9b57 100644 --- a/src/transformers/models/mobilebert/configuration_mobilebert.py +++ b/src/transformers/models/mobilebert/configuration_mobilebert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MobileBERT model configuration""" +"""MobileBERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py index bab2cbac8e..d73c276b4f 100644 --- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 MobileBERT model.""" - +"""TF 2.0 MobileBERT model.""" from __future__ import annotations diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py index ccfdcc31ff..32dc995668 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -15,7 +15,6 @@ # limitations under the License. """Tokenization classes for MobileBERT.""" - import collections import os import unicodedata diff --git a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py index 70075bcc94..2bf204a66d 100644 --- a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MobileNetV1 model configuration""" +"""MobileNetV1 model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py index 4985e0ff22..1b53bbeab4 100644 --- a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert MobileNetV1 checkpoints from the tensorflow/models library.""" - import argparse import json import re diff --git a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py index 85a398f898..00f8c501b2 100755 --- a/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MobileNetV1 model.""" - +"""PyTorch MobileNetV1 model.""" from typing import Optional, Union diff --git a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py index 81e590d5a3..25bcfa5785 100644 --- a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MobileNetV2 model configuration""" +"""MobileNetV2 model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py index 443bf8fd7e..1fdb9783cc 100644 --- a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert MobileNetV2 checkpoints from the tensorflow/models library.""" - import argparse import json import re diff --git a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py index 3c8fd5193a..7eb231380e 100755 --- a/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/modeling_mobilenet_v2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch MobileNetV2 model.""" - +"""PyTorch MobileNetV2 model.""" from typing import Optional, Union @@ -134,29 +133,29 @@ def _build_tf_to_pytorch_map(model, config, tf_weights=None): tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_pool.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_pool.normalization.weight tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = model.segmentation_head.conv_pool.normalization.running_mean - tf_to_pt_map[ - prefix + "BatchNorm/moving_variance" - ] = model.segmentation_head.conv_pool.normalization.running_var + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( + model.segmentation_head.conv_pool.normalization.running_var + ) prefix = "aspp0/" tf_to_pt_map[prefix + "weights"] = model.segmentation_head.conv_aspp.convolution.weight tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_aspp.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_aspp.normalization.weight tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = model.segmentation_head.conv_aspp.normalization.running_mean - tf_to_pt_map[ - prefix + "BatchNorm/moving_variance" - ] = model.segmentation_head.conv_aspp.normalization.running_var + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( + model.segmentation_head.conv_aspp.normalization.running_var + ) prefix = "concat_projection/" tf_to_pt_map[prefix + "weights"] = model.segmentation_head.conv_projection.convolution.weight tf_to_pt_map[prefix + "BatchNorm/beta"] = model.segmentation_head.conv_projection.normalization.bias tf_to_pt_map[prefix + "BatchNorm/gamma"] = model.segmentation_head.conv_projection.normalization.weight - tf_to_pt_map[ - prefix + "BatchNorm/moving_mean" - ] = model.segmentation_head.conv_projection.normalization.running_mean - tf_to_pt_map[ - prefix + "BatchNorm/moving_variance" - ] = model.segmentation_head.conv_projection.normalization.running_var + tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = ( + model.segmentation_head.conv_projection.normalization.running_mean + ) + tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = ( + model.segmentation_head.conv_projection.normalization.running_var + ) prefix = "logits/semantic/" tf_to_pt_map[ema(prefix + "weights")] = model.segmentation_head.classifier.convolution.weight diff --git a/src/transformers/models/mobilevit/configuration_mobilevit.py b/src/transformers/models/mobilevit/configuration_mobilevit.py index 5650002b3c..500f8b23db 100644 --- a/src/transformers/models/mobilevit/configuration_mobilevit.py +++ b/src/transformers/models/mobilevit/configuration_mobilevit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MobileViT model configuration""" +"""MobileViT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py index e251b124b4..522d6671d1 100644 --- a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert MobileViT checkpoints from the ml-cvnets library.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index 7e23d1b5ca..1931e975c8 100755 --- a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -14,8 +14,7 @@ # limitations under the License. # # Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE -""" PyTorch MobileViT model.""" - +"""PyTorch MobileViT model.""" import math from typing import Dict, Optional, Set, Tuple, Union diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index 179f209e87..01bd35b7ab 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -14,7 +14,7 @@ # limitations under the License. # # Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE -""" TensorFlow 2.0 MobileViT model.""" +"""TensorFlow 2.0 MobileViT model.""" from __future__ import annotations diff --git a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py index 957a43f770..65260d6501 100644 --- a/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/configuration_mobilevitv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MobileViTV2 model configuration""" +"""MobileViTV2 model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py index 2e2d31295d..518dc949a4 100644 --- a/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevitv2/convert_mlcvnets_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert MobileViTV2 checkpoints from the ml-cvnets library.""" - import argparse import collections import json diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index 3d6c4c1b39..3db6582bcf 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -14,8 +14,7 @@ # limitations under the License. # # Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE -""" PyTorch MobileViTV2 model.""" - +"""PyTorch MobileViTV2 model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/mpnet/configuration_mpnet.py b/src/transformers/models/mpnet/configuration_mpnet.py index 9c53e45d98..0abb89c942 100644 --- a/src/transformers/models/mpnet/configuration_mpnet.py +++ b/src/transformers/models/mpnet/configuration_mpnet.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" MPNet model configuration""" +"""MPNet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index bc811fd22f..11a27f5577 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -15,7 +15,6 @@ # limitations under the License. """PyTorch MPNet model.""" - import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index f0afba869b..d1864bd197 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 MPNet model.""" - +"""TF 2.0 MPNet model.""" from __future__ import annotations diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index 5d18b1419e..ed822c813b 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Mpt configuration""" +"""Mpt configuration""" + from typing import TYPE_CHECKING, Optional, Union diff --git a/src/transformers/models/mra/configuration_mra.py b/src/transformers/models/mra/configuration_mra.py index 30c38795b5..6837de4f80 100644 --- a/src/transformers/models/mra/configuration_mra.py +++ b/src/transformers/models/mra/configuration_mra.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MRA model configuration""" +"""MRA model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index b47a41d73d..09b2136593 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MRA model.""" - +"""PyTorch MRA model.""" import math from pathlib import Path diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index 2d31a52563..ef629718b1 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" mT5 model configuration""" +"""mT5 model configuration""" + from typing import Mapping from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/mt5/modeling_flax_mt5.py b/src/transformers/models/mt5/modeling_flax_mt5.py index 98406439df..fbb5b107f5 100644 --- a/src/transformers/models/mt5/modeling_flax_mt5.py +++ b/src/transformers/models/mt5/modeling_flax_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax mT5 model.""" +"""Flax mT5 model.""" import jax.numpy as jnp diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index 84a9f78ca9..1336b91961 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch mT5 model.""" +"""PyTorch mT5 model.""" import copy import math diff --git a/src/transformers/models/mt5/modeling_tf_mt5.py b/src/transformers/models/mt5/modeling_tf_mt5.py index f8350eb197..7270a54948 100644 --- a/src/transformers/models/mt5/modeling_tf_mt5.py +++ b/src/transformers/models/mt5/modeling_tf_mt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tensorflow mT5 model.""" +"""Tensorflow mT5 model.""" from ...utils import logging from ..t5.modeling_tf_t5 import TFT5EncoderModel, TFT5ForConditionalGeneration, TFT5Model diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 58be655f38..ef2e0244c1 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MusicGen model configuration""" +"""MusicGen model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/musicgen/convert_musicgen_transformers.py b/src/transformers/models/musicgen/convert_musicgen_transformers.py index a072ec321b..f4afd24df0 100644 --- a/src/transformers/models/musicgen/convert_musicgen_transformers.py +++ b/src/transformers/models/musicgen/convert_musicgen_transformers.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Convert MusicGen checkpoints from the original repository.""" + import argparse from pathlib import Path from typing import Dict, OrderedDict, Tuple diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index d6f0ae96f4..810f34f780 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Musicgen model.""" +"""PyTorch Musicgen model.""" + import copy import inspect import math diff --git a/src/transformers/models/musicgen/processing_musicgen.py b/src/transformers/models/musicgen/processing_musicgen.py index 847c542a60..c153c5dfe1 100644 --- a/src/transformers/models/musicgen/processing_musicgen.py +++ b/src/transformers/models/musicgen/processing_musicgen.py @@ -15,6 +15,7 @@ """ Text/audio processor class for MusicGen """ + from typing import List, Optional import numpy as np diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 050bbbc2b6..b29187facb 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Musicgen Melody model configuration""" +"""Musicgen Melody model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py b/src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py index 9e224d93f1..52980f73ec 100644 --- a/src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py +++ b/src/transformers/models/musicgen_melody/convert_musicgen_melody_transformers.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Convert Musicgen Melody checkpoints from the original repository.""" + import argparse from pathlib import Path from typing import Dict, OrderedDict, Tuple diff --git a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py index 2013309da5..ac83f3ac8d 100644 --- a/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py @@ -15,6 +15,7 @@ """ Feature extractor class for Musicgen Melody """ + import copy from typing import Any, Dict, List, Optional, Union diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 6458df0a1b..119628d50d 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Musicgen Melody model.""" +"""PyTorch Musicgen Melody model.""" + import copy import inspect import math diff --git a/src/transformers/models/musicgen_melody/processing_musicgen_melody.py b/src/transformers/models/musicgen_melody/processing_musicgen_melody.py index a474be38b4..34b1d1ec4d 100644 --- a/src/transformers/models/musicgen_melody/processing_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/processing_musicgen_melody.py @@ -15,6 +15,7 @@ """ Text/audio processor class for MusicGen Melody """ + from typing import List, Optional import numpy as np diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index 00f6b14249..8e2317982b 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" MVP model configuration""" +"""MVP model configuration""" + import warnings from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 7c0f478567..319f1760ce 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch MVP model.""" +"""PyTorch MVP model.""" + import copy import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index baf0ea13a5..b20a60ac1e 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Neighborhood Attention Transformer model configuration""" +"""Neighborhood Attention Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py index b9c332c894..fa51801009 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Neighborhood Attention Transformer model.""" - +"""PyTorch Neighborhood Attention Transformer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index e10ba41696..30c8b6d890 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""PyTorch Nezha model.""" - import math import os import warnings diff --git a/src/transformers/models/nllb_moe/configuration_nllb_moe.py b/src/transformers/models/nllb_moe/configuration_nllb_moe.py index 98c8397c18..ef12c199ef 100644 --- a/src/transformers/models/nllb_moe/configuration_nllb_moe.py +++ b/src/transformers/models/nllb_moe/configuration_nllb_moe.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" NLLB-MoE model configuration""" +"""NLLB-MoE model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 1f39dfd39b..2bec0fb84d 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch NLLB-MoE model.""" - +"""PyTorch NLLB-MoE model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/nougat/convert_nougat_to_hf.py b/src/transformers/models/nougat/convert_nougat_to_hf.py index ecc74fdb5f..e42f8553ac 100644 --- a/src/transformers/models/nougat/convert_nougat_to_hf.py +++ b/src/transformers/models/nougat/convert_nougat_to_hf.py @@ -113,22 +113,22 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" - ] = val[dim : dim * 2, :] + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( + val[dim : dim * 2, :] + ) orig_state_dict[ f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" - ] = val[:dim] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" - ] = val[-dim:] + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( + val[:dim] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = ( + val[dim : dim * 2] + ) + orig_state_dict[f"encoder.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( + val[-dim:] + ) elif "attn_mask" in key or key in ["encoder.model.norm.weight", "encoder.model.norm.bias"]: # HuggingFace implementation doesn't use attn_mask buffer # and model doesn't use final LayerNorms for the encoder diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat_fast.py index ef6b613bba..0a7eec4ad9 100644 --- a/src/transformers/models/nougat/tokenization_nougat_fast.py +++ b/src/transformers/models/nougat/tokenization_nougat_fast.py @@ -15,6 +15,7 @@ """ Fast tokenizer class for Nougat. 
""" + import re from functools import partial from multiprocessing import Pool diff --git a/src/transformers/models/nystromformer/configuration_nystromformer.py b/src/transformers/models/nystromformer/configuration_nystromformer.py index ca277e266d..e52b02d9f8 100644 --- a/src/transformers/models/nystromformer/configuration_nystromformer.py +++ b/src/transformers/models/nystromformer/configuration_nystromformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Nystromformer model configuration""" +"""Nystromformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index cb027c314e..4bb4c33fff 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Nystromformer model.""" - +"""PyTorch Nystromformer model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 56cd01f7f2..440dc2ee9d 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -17,7 +17,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" OLMo model configuration""" +"""OLMo model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/oneformer/configuration_oneformer.py b/src/transformers/models/oneformer/configuration_oneformer.py index f3c01191d9..57bff716f4 100644 --- a/src/transformers/models/oneformer/configuration_oneformer.py +++ b/src/transformers/models/oneformer/configuration_oneformer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """OneFormer model configuration""" + from typing import Dict, Optional from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index fff665b2ff..9c2f662207 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch OneFormer model.""" +"""PyTorch OneFormer model.""" + import copy import math import warnings diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index 64411455ee..dde668b32f 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" OpenAI GPT configuration""" +"""OpenAI GPT configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py index 1b101aea0c..3d5218c204 100755 --- a/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/openai/convert_openai_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert OpenAI GPT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 1c754daa0e..07b6f9a637 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -15,7 +15,6 @@ # limitations under the License. """PyTorch OpenAI GPT model.""" - import json import math import os diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index e2d0ae885c..20f5581c95 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 OpenAI GPT model.""" +"""TF 2.0 OpenAI GPT model.""" from __future__ import annotations diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py index 4f2b279160..d7427aa429 100644 --- a/src/transformers/models/openai/tokenization_openai.py +++ b/src/transformers/models/openai/tokenization_openai.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for OpenAI GPT.""" - import json import os import re diff --git a/src/transformers/models/openai/tokenization_openai_fast.py b/src/transformers/models/openai/tokenization_openai_fast.py index 214db53850..41f4c8db90 100644 --- a/src/transformers/models/openai/tokenization_openai_fast.py +++ b/src/transformers/models/openai/tokenization_openai_fast.py @@ -14,7 +14,6 @@ # limitations under the License. """Fast Tokenization classes for OpenAI GPT.""" - from typing import Optional, Tuple from ...tokenization_utils_fast import PreTrainedTokenizerFast diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index a9802d2ef3..455a6362a7 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" OPT model configuration""" +"""OPT model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py index 3f302b2ec3..486b477f97 100644 --- a/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/opt/convert_opt_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert OPT checkpoint.""" - import argparse from pathlib import Path diff --git a/src/transformers/models/opt/modeling_flax_opt.py b/src/transformers/models/opt/modeling_flax_opt.py index 5d9839f120..c6296e4eea 100644 --- a/src/transformers/models/opt/modeling_flax_opt.py +++ b/src/transformers/models/opt/modeling_flax_opt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax OPT model.""" +"""Flax OPT model.""" from functools import partial from typing import Optional, Tuple diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index f93c3866ae..87695314b0 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch OPT model.""" +"""PyTorch OPT model.""" + from typing import List, Optional, Tuple, Union import torch diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index 8dbad97e08..9c5dfa4ade 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 OPT model.""" - +"""TF 2.0 OPT model.""" from __future__ import annotations diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 4b09166b70..72d52a533d 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" OWLv2 model configuration""" +"""OWLv2 model configuration""" import os from typing import TYPE_CHECKING, Dict, Union diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index c98846ea69..a7924085fc 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch OWLv2 model.""" +"""PyTorch OWLv2 model.""" from dataclasses import dataclass from functools import lru_cache diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 747f1c3ccb..2cbb061222 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" OWL-ViT model configuration""" +"""OWL-ViT model configuration""" import os from collections import OrderedDict diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index c0676f519e..a7d8445523 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch OWL-ViT model.""" +"""PyTorch OWL-ViT model.""" from dataclasses import dataclass from functools import lru_cache diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 7252425b85..d092142476 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PaliGemmamodel configuration""" +"""PaliGemmamodel configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py b/src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py index 0d43cac016..bcea5372e5 100644 --- a/src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py +++ b/src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py @@ -12,9 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Convert PaliGemma checkpoints from the original repository. -""" - +"""Convert PaliGemma checkpoints from the original repository.""" import argparse import collections diff --git a/src/transformers/models/paligemma/modeling_paligemma.py b/src/transformers/models/paligemma/modeling_paligemma.py index b2f2904b2f..bc900f0de7 100644 --- a/src/transformers/models/paligemma/modeling_paligemma.py +++ b/src/transformers/models/paligemma/modeling_paligemma.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch PaliGemmamodel.""" +"""PyTorch PaliGemmamodel.""" + from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 258954d856..f086795f43 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -16,7 +16,6 @@ Processor class for PaliGemma. """ - import logging from typing import List, Optional, Union diff --git a/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py index c3766c3325..10089a3fef 100644 --- a/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PatchTSMixer model configuration""" +"""PatchTSMixer model configuration""" from typing import List, Optional, Union diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py index a824faa040..e4c8385697 100644 --- a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py +++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PatchTSMixer model.""" +"""PyTorch PatchTSMixer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py index 884cd44c83..e30e457283 100755 --- a/src/transformers/models/patchtst/modeling_patchtst.py +++ b/src/transformers/models/patchtst/modeling_patchtst.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PatchTST model.""" +"""PyTorch PatchTST model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index 7dff1a7f85..2cc49857f3 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PEGASUS model configuration""" +"""PEGASUS model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py index f822af1f22..e50fc1710c 100644 --- a/src/transformers/models/pegasus/modeling_flax_pegasus.py +++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Flax PEGASUS model.""" - +"""Flax PEGASUS model.""" import math import random diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index f151ae9940..42cef3a635 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PEGASUS model.""" +"""PyTorch PEGASUS model.""" import copy import math diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index a3acdc027f..45e9fdbbed 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Pegasus model.""" - +"""TF 2.0 Pegasus model.""" from __future__ import annotations diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index f1252e959e..11ccb1ff4a 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model PEGASUS.""" - +"""Tokenization class for model PEGASUS.""" import os from shutil import copyfile diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index 166f3b18ab..b84c79656e 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PEGASUS-X model configuration""" +"""PEGASUS-X model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index ba99256a16..6d9072777b 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PEGASUS-X model.""" +"""PyTorch PEGASUS-X model.""" import dataclasses import math diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py index b4b996aef0..e2c9cca4c3 100644 --- a/src/transformers/models/perceiver/configuration_perceiver.py +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Perceiver model configuration""" +"""Perceiver model configuration""" from collections import OrderedDict from typing import Any, Mapping, Optional, Union diff --git a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py index 1ea9798127..190b4f51f6 100644 --- a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py +++ b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Perceiver checkpoints originally implemented in Haiku.""" - import argparse import json import pickle diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index f768df991b..4921e292d6 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Perceiver model.""" +"""PyTorch Perceiver model.""" import abc import math diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index b4ec1e378e..90686b78dc 100644 --- a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for Perceiver.""" - +"""Tokenization class for Perceiver.""" from typing import Dict, List, Optional, Tuple diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 04bf792964..b8e02256de 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Persimmon model configuration""" +"""Persimmon model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index ea2bd074ee..803169ddd5 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Persimmon model.""" +"""PyTorch Persimmon model.""" + import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index d221255f11..c67f07dcf5 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Phi model configuration""" - +"""Phi model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 1436138f91..d8c1f4a9b4 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Phi model.""" - +"""PyTorch Phi model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index dfa576ad61..0e80566f54 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Phi-3 model configuration""" - +"""Phi-3 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 224aad0085..9ce7e44dce 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Phi-3 model.""" +"""PyTorch Phi-3 model.""" import inspect import math diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index f312f49501..85450f4d8e 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for PhoBERT""" - +"""Tokenization classes for PhoBERT""" import os import re diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 2ad2509e44..24cf598cc4 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Pix2Struct model configuration""" +"""Pix2Struct model configuration""" import os from typing import Union diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py index 945708d69a..466997c8d8 100644 --- a/src/transformers/models/pix2struct/image_processing_pix2struct.py +++ b/src/transformers/models/pix2struct/image_processing_pix2struct.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for Pix2Struct.""" + import io import math from typing import Dict, Optional, Union diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 86ccb1dd74..94d882c805 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Pix2Struct modeling file""" +"""Pix2Struct modeling file""" import math from typing import Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/plbart/configuration_plbart.py b/src/transformers/models/plbart/configuration_plbart.py index b899847b04..86dbc0cec8 100644 --- a/src/transformers/models/plbart/configuration_plbart.py +++ b/src/transformers/models/plbart/configuration_plbart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PLBART model configuration""" +"""PLBART model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index e2b59f980c..93d91e1600 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PLBART model.""" +"""PyTorch PLBART model.""" + import copy import math from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py index 1f297077fe..a7467b380e 100644 --- a/src/transformers/models/poolformer/configuration_poolformer.py +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PoolFormer model configuration""" +"""PoolFormer model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/poolformer/modeling_poolformer.py b/src/transformers/models/poolformer/modeling_poolformer.py index ae4e3ea0a7..e70974507b 100755 --- a/src/transformers/models/poolformer/modeling_poolformer.py +++ b/src/transformers/models/poolformer/modeling_poolformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch PoolFormer model.""" - +"""PyTorch PoolFormer model.""" import collections.abc from typing import Optional, Tuple, Union diff --git a/src/transformers/models/pop2piano/configuration_pop2piano.py b/src/transformers/models/pop2piano/configuration_pop2piano.py index 8bb46b008d..51043dab0c 100644 --- a/src/transformers/models/pop2piano/configuration_pop2piano.py +++ b/src/transformers/models/pop2piano/configuration_pop2piano.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Pop2Piano model configuration""" - +"""Pop2Piano model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py b/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py index a73c57886d..54b8bb67e6 100644 --- a/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py +++ b/src/transformers/models/pop2piano/convert_pop2piano_weights_to_hf.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" File for loading the Pop2Piano model weights from the official repository and to show how tokenizer vocab was - constructed""" +"""File for loading the Pop2Piano model weights from the official repository and to show how tokenizer vocab was +constructed""" import json diff --git a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py index 9bf5326c0b..738b932355 100644 --- a/src/transformers/models/pop2piano/feature_extraction_pop2piano.py +++ b/src/transformers/models/pop2piano/feature_extraction_pop2piano.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Feature extractor class for Pop2Piano""" +"""Feature extractor class for Pop2Piano""" import warnings from typing import List, Optional, Union diff --git a/src/transformers/models/pop2piano/modeling_pop2piano.py b/src/transformers/models/pop2piano/modeling_pop2piano.py index e46f2907f4..7b8795e453 100644 --- a/src/transformers/models/pop2piano/modeling_pop2piano.py +++ b/src/transformers/models/pop2piano/modeling_pop2piano.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Pop2Piano model.""" - +"""PyTorch Pop2Piano model.""" import copy import math diff --git a/src/transformers/models/pop2piano/processing_pop2piano.py b/src/transformers/models/pop2piano/processing_pop2piano.py index 639d2e7aea..280e5dc796 100644 --- a/src/transformers/models/pop2piano/processing_pop2piano.py +++ b/src/transformers/models/pop2piano/processing_pop2piano.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Processor class for Pop2Piano.""" +"""Processor class for Pop2Piano.""" import os from typing import List, Optional, Union diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py index 1b40c9a2c0..7a9da32b3c 100644 --- a/src/transformers/models/prophetnet/configuration_prophetnet.py +++ b/src/transformers/models/prophetnet/configuration_prophetnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ProphetNet model configuration""" +"""ProphetNet model configuration""" from typing import Callable, Optional, Union diff --git a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py index c9e64c06ef..3039056116 100644 --- a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ProphetNet checkpoint.""" - import argparse from torch import nn diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index b7eca9c2b3..96fa2e2c12 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).""" +"""PyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).""" import copy import math diff --git a/src/transformers/models/pvt/configuration_pvt.py b/src/transformers/models/pvt/configuration_pvt.py index 82b4822435..25348818f0 100644 --- a/src/transformers/models/pvt/configuration_pvt.py +++ b/src/transformers/models/pvt/configuration_pvt.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Pvt model configuration""" +"""Pvt model configuration""" from collections import OrderedDict from typing import Callable, List, Mapping diff --git a/src/transformers/models/pvt/convert_pvt_to_pytorch.py b/src/transformers/models/pvt/convert_pvt_to_pytorch.py index 187f3200d6..73ae4c1571 100644 --- a/src/transformers/models/pvt/convert_pvt_to_pytorch.py +++ b/src/transformers/models/pvt/convert_pvt_to_pytorch.py @@ -16,7 +16,6 @@ # limitations under the License. """Convert Pvt checkpoints from the original library.""" - import argparse from pathlib import Path diff --git a/src/transformers/models/pvt/modeling_pvt.py b/src/transformers/models/pvt/modeling_pvt.py index f849b42e9c..306cc13122 100755 --- a/src/transformers/models/pvt/modeling_pvt.py +++ b/src/transformers/models/pvt/modeling_pvt.py @@ -14,7 +14,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch PVT model.""" +"""PyTorch PVT model.""" import collections import math diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py index 40ae3cc310..9f1fdfe31d 100644 --- a/src/transformers/models/qdqbert/configuration_qdqbert.py +++ b/src/transformers/models/qdqbert/configuration_qdqbert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" QDQBERT model configuration""" +"""QDQBERT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index a10979b916..6078061873 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch QDQBERT model.""" - +"""PyTorch QDQBERT model.""" import math import os diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index c2a99dfa8b..1c60d0dba9 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Qwen2 model configuration""" +"""Qwen2 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 98a9e27c9b..c037e28232 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Qwen2 model.""" +"""PyTorch Qwen2 model.""" + import inspect import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 942c0712be..a37f5218c6 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Qwen2MoE model configuration""" +"""Qwen2MoE model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py index 2229e485db..5dd4d12c5e 100644 --- a/src/transformers/models/rag/configuration_rag.py +++ b/src/transformers/models/rag/configuration_rag.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" RAG model configuration""" - +"""RAG model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index 9d8ed65049..d7fb649908 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -15,7 +15,6 @@ """TFRAG model implementation.""" - from __future__ import annotations import copy diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py index 5b6ec67e6b..5bc87a895d 100644 --- a/src/transformers/models/rag/tokenization_rag.py +++ b/src/transformers/models/rag/tokenization_rag.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for RAG.""" + import os import warnings from typing import List, Optional diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index fd21f44a55..7e84f2916d 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" REALM model configuration.""" +"""REALM model configuration.""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index ff5e43bc13..7c5c344ae5 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch REALM model.""" +"""PyTorch REALM model.""" import math import os diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index f5a3f9673a..7f45a41710 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RecurrentGemma model configuration""" +"""RecurrentGemma model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 06aef9a03d..c48132c83c 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch RecurrentGemma model.""" +"""PyTorch RecurrentGemma model.""" import math from typing import Dict, Optional, Tuple, Union diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index eecd67cc06..018831010b 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Reformer model configuration""" +"""Reformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py index f25e166ef9..ad6a077581 100755 --- a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py +++ b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Reformer checkpoint.""" - import argparse import pickle diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index efc692185b..eb45749336 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model Reformer.""" - +"""Tokenization class for model Reformer.""" import os from shutil import copyfile diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index fb0f2c8b8e..26f007a7f7 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model Reformer.""" - +"""Tokenization class for model Reformer.""" import os from shutil import copyfile diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py index e24bc70a89..34f90ce184 100644 --- a/src/transformers/models/regnet/configuration_regnet.py +++ b/src/transformers/models/regnet/configuration_regnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RegNet model configuration""" +"""RegNet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py index d29077c1a7..1f89b7bf8c 100644 --- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert RegNet checkpoints from timm and vissl.""" - import argparse import json from dataclasses import dataclass, field diff --git a/src/transformers/models/regnet/modeling_regnet.py b/src/transformers/models/regnet/modeling_regnet.py index df8f64a03b..2a348c792a 100644 --- a/src/transformers/models/regnet/modeling_regnet.py +++ b/src/transformers/models/regnet/modeling_regnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch RegNet model.""" +"""PyTorch RegNet model.""" from typing import Optional diff --git a/src/transformers/models/regnet/modeling_tf_regnet.py b/src/transformers/models/regnet/modeling_tf_regnet.py index 24ebb3f5ca..3d6b38b9e4 100644 --- a/src/transformers/models/regnet/modeling_tf_regnet.py +++ b/src/transformers/models/regnet/modeling_tf_regnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow RegNet model.""" +"""TensorFlow RegNet model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py index 471f2f7521..f9d28303fd 100644 --- a/src/transformers/models/rembert/configuration_rembert.py +++ b/src/transformers/models/rembert/configuration_rembert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RemBERT model configuration""" +"""RemBERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py b/src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py index 4c3d53e789..622d507080 100755 --- a/src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/rembert/convert_rembert_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert RemBERT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index e92418fcff..31f7e3dce4 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch RemBERT model.""" - +"""PyTorch RemBERT model.""" import math import os diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index daceef1080..5ee9ba1364 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 RemBERT model.""" - +"""TF 2.0 RemBERT model.""" from __future__ import annotations diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py index a2b1f9abc2..0c046b9bca 100644 --- a/src/transformers/models/rembert/tokenization_rembert.py +++ b/src/transformers/models/rembert/tokenization_rembert.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization classes for RemBERT.""" - import os from shutil import copyfile from typing import List, Optional, Tuple diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py index b7165e362a..350e02e33b 100644 --- a/src/transformers/models/rembert/tokenization_rembert_fast.py +++ b/src/transformers/models/rembert/tokenization_rembert_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for RemBERT model.""" - +"""Tokenization classes for RemBERT model.""" import os from shutil import copyfile diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 46ccd96cd9..92fe656287 100644 --- a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ResNet model configuration""" +"""ResNet model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py index 52b0bd9068..feceb74d16 100644 --- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py +++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ResNet checkpoints from timm.""" - import argparse import json from dataclasses import dataclass, field diff --git a/src/transformers/models/resnet/modeling_resnet.py b/src/transformers/models/resnet/modeling_resnet.py index 06af704603..c7cf0e03c7 100644 --- a/src/transformers/models/resnet/modeling_resnet.py +++ b/src/transformers/models/resnet/modeling_resnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ResNet model.""" +"""PyTorch ResNet model.""" from typing import Optional diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 4d68775c92..1e2ec143cd 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TensorFlow ResNet model.""" +"""TensorFlow ResNet model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 0ecd57b23a..d08f3df477 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RoBERTa configuration""" +"""RoBERTa configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index e4d95354ff..c0e6bf94d2 100644 --- a/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert RoBERTa checkpoint.""" - import argparse import pathlib diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index 84448a168d..439d12a870 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 RoBERTa model.""" - +"""TF 2.0 RoBERTa model.""" from __future__ import annotations diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 702af8a33e..8384397033 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Tokenization classes for RoBERTa.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index e7e74b0cdf..e0c939f657 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" RoBERTa-PreLayerNorm configuration""" +"""RoBERTa-PreLayerNorm configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py index 41fd14c5fd..b8491db08b 100644 --- a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert RoBERTa-PreLayerNorm checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py index c13778c1ac..c50227eaa2 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax RoBERTa-PreLayerNorm model.""" +"""Flax RoBERTa-PreLayerNorm model.""" + from typing import Callable, Optional, Tuple import flax.linen as nn diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index beb9c383e1..1ecd376901 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 RoBERTa-PreLayerNorm model.""" - +"""TF 2.0 RoBERTa-PreLayerNorm model.""" from __future__ import annotations diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 752c791cf9..d402349e67 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RoCBert model configuration""" +"""RoCBert model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 216aaf78b2..4c63d364ad 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch RoCBert model.""" +"""PyTorch RoCBert model.""" import math import os diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py index 0732c3a9e0..ae4ed0fd7b 100644 --- a/src/transformers/models/roformer/configuration_roformer.py +++ b/src/transformers/models/roformer/configuration_roformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RoFormer model configuration""" +"""RoFormer model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py index 0ab8b671d0..d227948e0e 100755 --- a/src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/roformer/convert_roformer_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert RoFormer checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/roformer/modeling_flax_roformer.py b/src/transformers/models/roformer/modeling_flax_roformer.py index 6e154b311d..f53a056c13 100644 --- a/src/transformers/models/roformer/modeling_flax_roformer.py +++ b/src/transformers/models/roformer/modeling_flax_roformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax RoFormer model.""" +"""Flax RoFormer model.""" from typing import Callable, Optional, Tuple diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 2911d98d31..69588ff743 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch RoFormer model.""" - +"""PyTorch RoFormer model.""" import math import os diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index e3f84cc78a..20af183691 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 RoFormer model.""" - +"""TF 2.0 RoFormer model.""" from __future__ import annotations diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 1f073c03a5..cc161c1a26 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Tokenization classes for RoFormer.""" + import json from typing import List, Optional, Tuple diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py index 57b7412333..9539b857ea 100644 --- a/src/transformers/models/rwkv/configuration_rwkv.py +++ b/src/transformers/models/rwkv/configuration_rwkv.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" RWKV configuration""" +"""RWKV configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py index b340b9f028..44cf17b1cf 100644 --- a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert a RWKV checkpoint from BlinkDL to the Hugging Face format.""" - import argparse import gc import json diff --git a/src/transformers/models/sam/configuration_sam.py b/src/transformers/models/sam/configuration_sam.py index 63dc5ff630..b0045655d2 100644 --- a/src/transformers/models/sam/configuration_sam.py +++ b/src/transformers/models/sam/configuration_sam.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SAM model configuration""" - +"""SAM model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/sam/convert_sam_to_hf.py b/src/transformers/models/sam/convert_sam_to_hf.py index be375494f0..dd8818b68c 100644 --- a/src/transformers/models/sam/convert_sam_to_hf.py +++ b/src/transformers/models/sam/convert_sam_to_hf.py @@ -19,6 +19,7 @@ URL: https://github.com/facebookresearch/segment-anything. Also supports converting the SlimSAM checkpoints from https://github.com/czg1225/SlimSAM/tree/master. """ + import argparse import re diff --git a/src/transformers/models/sam/image_processing_sam.py b/src/transformers/models/sam/image_processing_sam.py index ccdc72fc7b..99315858a3 100644 --- a/src/transformers/models/sam/image_processing_sam.py +++ b/src/transformers/models/sam/image_processing_sam.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for SAM.""" + import math from copy import deepcopy from itertools import product diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 0234572611..f5baf5bcf3 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SAM model.""" +"""PyTorch SAM model.""" import collections import math diff --git a/src/transformers/models/sam/modeling_tf_sam.py b/src/transformers/models/sam/modeling_tf_sam.py index 5da1293e0f..1e5099f191 100644 --- a/src/transformers/models/sam/modeling_tf_sam.py +++ b/src/transformers/models/sam/modeling_tf_sam.py @@ -17,7 +17,6 @@ TensorFlow SAM model. 
This file was mostly generated by auto-translation from the PyTorch original. In the event of a discrepancy, the original file should be regarded as the 'reference' version. """ - from __future__ import annotations import collections diff --git a/src/transformers/models/sam/processing_sam.py b/src/transformers/models/sam/processing_sam.py index ed89ebeb3a..9e67be1e1e 100644 --- a/src/transformers/models/sam/processing_sam.py +++ b/src/transformers/models/sam/processing_sam.py @@ -15,6 +15,7 @@ """ Processor class for SAM. """ + from copy import deepcopy from typing import Optional, Union diff --git a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py index 2075c65fa8..c24eb0ecb6 100644 --- a/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/configuration_seamless_m4t.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SeamlessM4T model configuration""" +"""SeamlessM4T model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py b/src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py index a90a30f579..b321af02e7 100644 --- a/src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py +++ b/src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Converting Meta SeamlessM4T checkpoints from seamless_communication to HF.""" - +"""Converting Meta SeamlessM4T checkpoints from seamless_communication to HF.""" import argparse import os diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index b23b30056e..d35629938b 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SeamlessM4T model.""" - +"""PyTorch SeamlessM4T model.""" import copy import math diff --git a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py index bb6beb760a..230283a0d4 100644 --- a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
"""Tokenization classes for SeamlessM4T.""" + import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py index a236db3cb5..70892c9948 100644 --- a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py +++ b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Fast Tokenization class for SeamlessM4T.""" + import os from shutil import copyfile from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py index 051a86cba1..30082cd5fd 100644 --- a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SeamlessM4Tv2 model configuration""" +"""SeamlessM4Tv2 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py b/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py index 8d4320cff5..97a633d05a 100644 --- a/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py +++ b/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF.""" - +"""Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF.""" import argparse import os diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py index 526b697300..a824ff8f24 100644 --- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SeamlessM4Tv2 model.""" - +"""PyTorch SeamlessM4Tv2 model.""" import copy import math diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index 6aadb64b6f..28fc1a7334 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" SegFormer model configuration""" +"""SegFormer model configuration""" import warnings from collections import OrderedDict diff --git a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py index 48dd453309..dbac5ab6b8 100644 --- a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py +++ b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert SegFormer checkpoints.""" - import argparse import json from collections import OrderedDict diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index d47219a0bd..2a2e544c48 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SegFormer model.""" - +"""PyTorch SegFormer model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 0657f1b437..0e5fc6d0be 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow SegFormer model.""" - +"""TensorFlow SegFormer model.""" from __future__ import annotations diff --git a/src/transformers/models/seggpt/configuration_seggpt.py b/src/transformers/models/seggpt/configuration_seggpt.py index bac482e97f..f79e7f12b2 100644 --- a/src/transformers/models/seggpt/configuration_seggpt.py +++ b/src/transformers/models/seggpt/configuration_seggpt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SegGpt model configuration""" - +"""SegGpt model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/seggpt/convert_seggpt_to_hf.py b/src/transformers/models/seggpt/convert_seggpt_to_hf.py index a13372dfbb..d67daeab93 100644 --- a/src/transformers/models/seggpt/convert_seggpt_to_hf.py +++ b/src/transformers/models/seggpt/convert_seggpt_to_hf.py @@ -17,7 +17,6 @@ URL: https://github.com/baaivision/Painter/tree/main/SegGPT """ - import argparse import requests diff --git a/src/transformers/models/seggpt/modeling_seggpt.py b/src/transformers/models/seggpt/modeling_seggpt.py index 65672e4213..b84fd8c9d2 100644 --- a/src/transformers/models/seggpt/modeling_seggpt.py +++ b/src/transformers/models/seggpt/modeling_seggpt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch SegGpt model.""" - +"""PyTorch SegGpt model.""" import collections.abc import math diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 33ea6d374f..6c877277ae 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SEW model configuration""" +"""SEW model configuration""" import functools import operator diff --git a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py index 81c3284af8..df0cae2a3b 100644 --- a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert SEW checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 199d0e543a..ea10b25818 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SEW model.""" +"""PyTorch SEW model.""" import math import warnings diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index 7a38815b47..ea791935ba 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SEW-D model configuration""" +"""SEW-D model configuration""" import functools import operator diff --git a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py index 7844d7912f..1540efa4be 100644 --- a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert SEW checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 352a86c9c2..2daa6739ef 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch SEW model.""" +"""PyTorch SEW model.""" import math import warnings diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 7692f79abb..ca26919b7e 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Siglip model configuration""" +"""Siglip model configuration""" import os from typing import Union diff --git a/src/transformers/models/siglip/convert_siglip_to_hf.py b/src/transformers/models/siglip/convert_siglip_to_hf.py index 6adacef84f..163f6f2797 100644 --- a/src/transformers/models/siglip/convert_siglip_to_hf.py +++ b/src/transformers/models/siglip/convert_siglip_to_hf.py @@ -17,7 +17,6 @@ URL: https://github.com/google-research/big_vision/tree/main """ - import argparse import collections from pathlib import Path diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 87fae3493b..d6607f9ef7 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Siglip model.""" - +"""PyTorch Siglip model.""" import math import warnings diff --git a/src/transformers/models/siglip/tokenization_siglip.py b/src/transformers/models/siglip/tokenization_siglip.py index 41277320a3..6203c68870 100644 --- a/src/transformers/models/siglip/tokenization_siglip.py +++ b/src/transformers/models/siglip/tokenization_siglip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for SigLIP model.""" +"""Tokenization class for SigLIP model.""" import os import re diff --git a/src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py b/src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py index 89690a5729..874aa2e066 100644 --- a/src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py +++ b/src/transformers/models/speech_encoder_decoder/convert_mbart_wav2vec2_seq2seq_original_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Wav2Vec2 checkpoint.""" - import argparse import fairseq diff --git a/src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py b/src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py index 5e726aa9fd..3772889820 100644 --- a/src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py +++ b/src/transformers/models/speech_encoder_decoder/convert_speech_to_text_wav2vec2_seq2seq_original_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert Wav2Vec2 checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py index e3bbd86266..2a15714cff 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Flax Speech-Encoder-Decoder architectures""" +"""Classes to support Flax Speech-Encoder-Decoder architectures""" import os from typing import Optional, Tuple, Union diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 77b69afe8f..d164622cd6 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Speech-Encoder-Text-Decoder architectures""" - +"""Classes to support Speech-Encoder-Text-Decoder architectures""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 2b8e3bd22e..80602e9a7d 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Speech2Text model configuration""" +"""Speech2Text model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 155d6a5dfe..8353a172b2 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Speech2Text model.""" +"""PyTorch Speech2Text model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 91e6028332..bac1256ca4 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TensorFlow Speech2Text model.""" - +"""TensorFlow Speech2Text model.""" from __future__ import annotations diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py index 42e9006338..646b389994 100644 --- a/src/transformers/models/speech_to_text/processing_speech_to_text.py +++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py @@ -15,6 +15,7 @@ """ Speech processor class for Speech2Text """ + import warnings from contextlib import contextmanager diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index 27db0a671e..1b9841f0cf 100644 --- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Speech2Text.""" + import json import os from pathlib import Path diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py index 5c9ebbe00f..bcc92a7bd2 100644 --- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Speech2Text model configuration""" +"""Speech2Text model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 20f8555bd9..35305408e6 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Speech2Text2 model.""" - +"""PyTorch Speech2Text2 model.""" import copy import math diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py index 47a45d700f..9e0881d89d 100644 --- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py @@ -15,6 +15,7 @@ """ Speech processor class for Speech2Text2 """ + import warnings from contextlib import contextmanager diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py index 9188325303..d8f4497de8 100644 --- a/src/transformers/models/speecht5/configuration_speecht5.py +++ b/src/transformers/models/speecht5/configuration_speecht5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" SpeechT5 model configuration""" +"""SpeechT5 model configuration""" import functools import operator diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 175b4b6e79..22ee648b97 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SpeechT5 model.""" +"""PyTorch SpeechT5 model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py index 41cb296f8f..97b2feaab3 100644 --- a/src/transformers/models/speecht5/tokenization_speecht5.py +++ b/src/transformers/models/speecht5/tokenization_speecht5.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization class for SpeechT5.""" - import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/splinter/configuration_splinter.py b/src/transformers/models/splinter/configuration_splinter.py index 83e78e4e4a..9a946fd4be 100644 --- a/src/transformers/models/splinter/configuration_splinter.py +++ b/src/transformers/models/splinter/configuration_splinter.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Splinter model configuration""" +"""Splinter model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index f3f470258d..6494a57fa4 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Splinter model.""" - +"""PyTorch Splinter model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/squeezebert/configuration_squeezebert.py b/src/transformers/models/squeezebert/configuration_squeezebert.py index ec79988849..1f3753ac5c 100644 --- a/src/transformers/models/squeezebert/configuration_squeezebert.py +++ b/src/transformers/models/squeezebert/configuration_squeezebert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SqueezeBERT model configuration""" +"""SqueezeBERT model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index cd5dcb0842..483bac01bd 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SqueezeBert model.""" - +"""PyTorch SqueezeBert model.""" import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 64b39fe20e..da7086ba74 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" StableLM model configuration """ +"""StableLM model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 5b81abc693..df4e922c5a 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch StableLM model.""" +"""PyTorch StableLM model.""" + import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 3bb0d1b655..91f9e94983 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Starcoder2 model configuration""" +"""Starcoder2 model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index e0abef4b41..32e5998884 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -17,7 +17,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Starcoder2 model.""" +"""PyTorch Starcoder2 model.""" + import inspect import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index 87473480d7..cfd3dfd86e 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""PyTorch SuperPoint model.""" + from dataclasses import dataclass from typing import Optional, Tuple, Union diff --git a/src/transformers/models/swiftformer/configuration_swiftformer.py b/src/transformers/models/swiftformer/configuration_swiftformer.py index be95094bac..abfdf51652 100644 --- a/src/transformers/models/swiftformer/configuration_swiftformer.py +++ b/src/transformers/models/swiftformer/configuration_swiftformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" SwiftFormer model configuration""" +"""SwiftFormer model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py b/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py index f6cabb34b6..21ecebebe2 100644 --- a/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py +++ b/src/transformers/models/swiftformer/convert_swiftformer_original_to_hf.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert SwiftFormer checkpoints from the original implementation.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/swiftformer/modeling_swiftformer.py b/src/transformers/models/swiftformer/modeling_swiftformer.py index 68a0ee53b0..bd86c3d717 100644 --- a/src/transformers/models/swiftformer/modeling_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_swiftformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SwiftFormer model.""" - +"""PyTorch SwiftFormer model.""" import collections.abc from typing import Optional, Tuple, Union diff --git a/src/transformers/models/swiftformer/modeling_tf_swiftformer.py b/src/transformers/models/swiftformer/modeling_tf_swiftformer.py index 271bd5e280..3f1d19e9e3 100644 --- a/src/transformers/models/swiftformer/modeling_tf_swiftformer.py +++ b/src/transformers/models/swiftformer/modeling_tf_swiftformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow SwiftFormer model.""" - +"""TensorFlow SwiftFormer model.""" import collections.abc from typing import Optional, Tuple, Union diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 281d0f047b..321648f149 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Swin Transformer model configuration""" +"""Swin Transformer model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py index 156b0ba86c..6402346289 100644 --- a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py @@ -95,15 +95,15 @@ def convert_state_dict(orig_state_dict, model): dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size if "weight" in key: - orig_state_dict[ - f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] + orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"] = ( + val[:dim, :] + ) orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[ - f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] + orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"] = ( + val[-dim:, :] + ) else: orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = val[ :dim diff --git a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py index 828237490e..c91249b272 100644 --- a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py @@ -102,15 +102,15 @@ def convert_state_dict(orig_state_dict, model): dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size if "weight" in key: - orig_state_dict[ - f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" - ] = val[:dim, :] + orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"] = ( + val[:dim, :] + ) orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = val[ dim : dim * 2, : ] - orig_state_dict[ - f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" - ] = val[-dim:, :] + orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"] = ( + val[-dim:, :] + ) else: orig_state_dict[f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = val[ :dim diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 9f64f8009a..13c20f59e9 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Swin Transformer model.""" - +"""PyTorch Swin Transformer model.""" import collections.abc import math diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 99da3d7f1e..035b31e8d4 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 Swin Transformer model.""" - +"""TF 2.0 Swin Transformer model.""" from __future__ import annotations diff --git a/src/transformers/models/swin2sr/configuration_swin2sr.py b/src/transformers/models/swin2sr/configuration_swin2sr.py index 98177a804a..0d910e89e4 100644 --- a/src/transformers/models/swin2sr/configuration_swin2sr.py +++ b/src/transformers/models/swin2sr/configuration_swin2sr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Swin2SR Transformer model configuration""" +"""Swin2SR Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py index 6884bf0afc..f053128339 100644 --- a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py +++ b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py @@ -137,22 +137,22 @@ def convert_state_dict(orig_state_dict, config): orig_state_dict[ f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[ - f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight" - ] = val[dim : dim * 2, :] + orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight"] = ( + val[dim : dim * 2, :] + ) orig_state_dict[ f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[ - f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias" - ] = val[:dim] - orig_state_dict[ - f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias" - ] = val[dim : dim * 2] - orig_state_dict[ - f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias" - ] = val[-dim:] + orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias"] = ( + val[:dim] + ) + orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias"] = ( + val[dim : dim * 2] + ) + orig_state_dict[f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias"] = ( + val[-dim:] + ) pass else: orig_state_dict[rename_key(key, config)] = val diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 96b67da51e..64fad4d9b2 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Swin2SR Transformer model.""" - +"""PyTorch Swin2SR Transformer model.""" import collections.abc import math diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 17e924804c..c6032c45df 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Swinv2 Transformer model configuration""" +"""Swinv2 Transformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py index 21deda864c..0e6e837a7e 100644 --- a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py +++ b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py @@ -145,22 +145,22 @@ def convert_state_dict(orig_state_dict, model): orig_state_dict[ f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight" ] = val[:dim, :] - orig_state_dict[ - f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight" - ] = val[dim : dim * 2, :] + orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"] = ( + val[dim : dim * 2, :] + ) orig_state_dict[ f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight" ] = val[-dim:, :] else: - orig_state_dict[ - f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias" - ] = val[:dim] + orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"] = ( + val[:dim] + ) orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"] = val[ dim : dim * 2 ] - orig_state_dict[ - f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias" - ] = val[-dim:] + orig_state_dict[f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"] = ( + val[-dim:] + ) else: orig_state_dict[rename_key(key)] = val diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 48a4c65722..b0682eacaa 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Swinv2 Transformer model.""" - +"""PyTorch Swinv2 Transformer model.""" import collections.abc import math diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index fe96297777..5ed95f2b61 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Switch Transformers model configuration""" +"""Switch Transformers model configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 395d055177..3701b30a22 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch SwitchTransformers model.""" - +"""PyTorch SwitchTransformers model.""" import copy import math diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index eec47e5eb2..e5f2615611 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" T5 model configuration""" +"""T5 model configuration""" + from typing import Mapping from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py index 7d9a20f3b0..9b1b15857c 100755 --- a/src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/t5/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert T5 checkpoint.""" - import argparse from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py index 11f32c8461..91ac9f08a0 100644 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py @@ -54,22 +54,22 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name]["pre_mlp_layer_norm"]["scale"] # Assigning - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"][ - "kernel" - ] = t5x_attention_key - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"][ - "kernel" - ] = t5x_attention_out - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"][ - "kernel" - ] = t5x_attention_query - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"][ - "kernel" - ] = t5x_attention_value + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = ( + t5x_attention_key + ) + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = ( + t5x_attention_out + ) + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = ( + t5x_attention_query + ) + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = ( + t5x_attention_value + ) - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"][ - "weight" - ] = t5x_attention_layer_norm + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = ( + t5x_attention_layer_norm + ) if split_mlp_wi: flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi_0"][ @@ -79,16 +79,16 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f "kernel" ] = t5x_mlp_wi_1 else: - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi"][ - "kernel" - ] = t5x_mlp_wi + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wi"]["kernel"] = ( + t5x_mlp_wi + ) - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wo"][ - "kernel" - ] = t5x_mlp_wo - flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"][ - "weight" - ] = t5x_mlp_layer_norm + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["DenseReluDense"]["wo"]["kernel"] = ( + t5x_mlp_wo + ) + flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = ( + t5x_mlp_layer_norm + ) # Only for layer 0: t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"]["rel_embedding"].T @@ -145,39 +145,39 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f tx5_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name]["pre_mlp_layer_norm"]["scale"] # Assigning - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"][ - "kernel" - ] = t5x_attention_key - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"][ - "kernel" - ] = t5x_attention_out - 
flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"][ - "kernel" - ] = t5x_attention_query - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"][ - "kernel" - ] = t5x_attention_value + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["k"]["kernel"] = ( + t5x_attention_key + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["o"]["kernel"] = ( + t5x_attention_out + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["q"]["kernel"] = ( + t5x_attention_query + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["SelfAttention"]["v"]["kernel"] = ( + t5x_attention_value + ) - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"][ - "weight" - ] = t5x_pre_attention_layer_norm + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"]["layer_norm"]["weight"] = ( + t5x_pre_attention_layer_norm + ) - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["k"][ - "kernel" - ] = t5x_enc_dec_attention_key - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["o"][ - "kernel" - ] = t5x_enc_dec_attention_out - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["q"][ - "kernel" - ] = t5x_enc_dec_attention_query - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["v"][ - "kernel" - ] = t5x_enc_dec_attention_value + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["k"]["kernel"] = ( + t5x_enc_dec_attention_key + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["o"]["kernel"] = ( + t5x_enc_dec_attention_out + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["q"]["kernel"] = ( + t5x_enc_dec_attention_query + ) + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["EncDecAttention"]["v"]["kernel"] = ( + t5x_enc_dec_attention_value + ) - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"][ - "weight" - ] = t5x_cross_layer_norm + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"]["layer_norm"]["weight"] = ( + t5x_cross_layer_norm + ) if split_mlp_wi: flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi_0"][ @@ -187,17 +187,17 @@ def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name, flax_dump_f "kernel" ] = t5x_mlp_wi_1 else: - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi"][ - "kernel" - ] = t5x_mlp_wi + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wi"]["kernel"] = ( + t5x_mlp_wi + ) - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wo"][ - "kernel" - ] = t5x_mlp_wo + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["DenseReluDense"]["wo"]["kernel"] = ( + t5x_mlp_wo + ) - flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["layer_norm"][ - "weight" - ] = tx5_mlp_layer_norm + flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"]["layer_norm"]["weight"] = ( + tx5_mlp_layer_norm + ) # Decoder Normalization tx5_decoder_norm = t5x_model["target"]["decoder"]["decoder_norm"]["scale"] diff 
--git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py index 94b24bd42f..be5ffd4489 100644 --- a/src/transformers/models/t5/modeling_flax_t5.py +++ b/src/transformers/models/t5/modeling_flax_t5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax T5 model.""" - +"""Flax T5 model.""" import copy from typing import Callable, Optional, Tuple diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 81dff54f99..224769fdfe 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch T5 model.""" - +"""PyTorch T5 model.""" import copy import math diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index d7a9c0a17d..6cd44766bf 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 T5 model.""" - +"""TF 2.0 T5 model.""" from __future__ import annotations diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 7292808adc..0f2ae101c8 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model T5.""" - +"""Tokenization class for model T5.""" import os import re diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index e9f2033812..0a92803f16 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for model T5.""" - +"""Tokenization class for model T5.""" import os import re diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index 8553ad1a65..f6647c5a4e 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Table Transformer model configuration""" +"""Table Transformer model configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py b/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py index d06c3eb26b..487cdc4819 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_to_hf.py @@ -17,7 +17,6 @@ URL: https://github.com/microsoft/table-transformer """ - import argparse from collections import OrderedDict from pathlib import Path diff --git a/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py b/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py index 0a2b7b87fe..1073d48877 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_to_hf_no_timm.py @@ -17,7 +17,6 @@ URL: https://github.com/microsoft/table-transformer """ - import argparse from pathlib import Path diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 3bfdde8907..a8c2593752 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Table Transformer model.""" - +"""PyTorch Table Transformer model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py index cbc5cebf4e..63d289e38f 100644 --- a/src/transformers/models/tapas/configuration_tapas.py +++ b/src/transformers/models/tapas/configuration_tapas.py @@ -22,7 +22,6 @@ Hyperparameters are taken from run_task_main.py and hparam_utils.py of the origi """ - from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py index 2772a7f126..34bf77cccd 100644 --- a/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/tapas/convert_tapas_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert TAPAS checkpoint.""" - import argparse from transformers import ( diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 903fa2a225..a06770778e 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch TAPAS model.""" - import enum import math import os diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index b26803cecb..3515dfe655 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""TF 2.0 TAPAS model.""" - from __future__ import annotations import enum diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 23fbd5300e..529ecb444e 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization class for TAPAS model.""" - +"""Tokenization class for TAPAS model.""" import collections import datetime diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 8c74f15174..56b06c0384 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Time Series Transformer model configuration""" +"""Time Series Transformer model configuration""" from typing import List, Optional, Union diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index dd7a2228a3..b45e6d7e85 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Time Series Transformer model.""" +"""PyTorch Time Series Transformer model.""" from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index ebcfcc8248..2ee7125de2 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TimeSformer model configuration""" +"""TimeSformer model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index f85e945247..2262898d54 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch TimeSformer model.""" - +"""PyTorch TimeSformer model.""" import collections from typing import Optional, Tuple, Union diff --git a/src/transformers/models/timm_backbone/configuration_timm_backbone.py b/src/transformers/models/timm_backbone/configuration_timm_backbone.py index 0f2f1b0b6c..6171bcce1e 100644 --- a/src/transformers/models/timm_backbone/configuration_timm_backbone.py +++ b/src/transformers/models/timm_backbone/configuration_timm_backbone.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Configuration for Backbone models""" +"""Configuration for Backbone models""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py index efa20d884e..f47412e93a 100644 --- a/src/transformers/models/trocr/configuration_trocr.py +++ b/src/transformers/models/trocr/configuration_trocr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TrOCR model configuration""" +"""TrOCR model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py index 428406d82c..a787932b76 100644 --- a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py +++ b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert TrOCR checkpoints from the unilm repository.""" - import argparse from pathlib import Path diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 1a0766fc25..04eb40ab2a 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch TrOCR decoder model (based on RoBERTa).""" - +"""PyTorch TrOCR decoder model (based on RoBERTa).""" import copy import math diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py index e7ce7362d4..b0d2e823fe 100644 --- a/src/transformers/models/trocr/processing_trocr.py +++ b/src/transformers/models/trocr/processing_trocr.py @@ -15,6 +15,7 @@ """ Processor class for TrOCR. """ + import warnings from contextlib import contextmanager diff --git a/src/transformers/models/tvlt/configuration_tvlt.py b/src/transformers/models/tvlt/configuration_tvlt.py index fbf65effd9..1a1782f68c 100644 --- a/src/transformers/models/tvlt/configuration_tvlt.py +++ b/src/transformers/models/tvlt/configuration_tvlt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TVLT model configuration""" +"""TVLT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/tvlt/image_processing_tvlt.py b/src/transformers/models/tvlt/image_processing_tvlt.py index f13101c15a..06576a0f7e 100644 --- a/src/transformers/models/tvlt/image_processing_tvlt.py +++ b/src/transformers/models/tvlt/image_processing_tvlt.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Image processor class for TVLT.""" + from typing import Dict, List, Optional, Union import numpy as np diff --git a/src/transformers/models/tvlt/modeling_tvlt.py b/src/transformers/models/tvlt/modeling_tvlt.py index 0376570fe5..d49fef5822 100644 --- a/src/transformers/models/tvlt/modeling_tvlt.py +++ b/src/transformers/models/tvlt/modeling_tvlt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch TVLT model.""" - +"""PyTorch TVLT model.""" import collections.abc import math diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 65c4e3a522..dfd3a04ee7 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TVP model configuration""" +"""TVP model configuration""" import copy diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py index 4e27399ab8..eb8aabfdad 100644 --- a/src/transformers/models/tvp/processing_tvp.py +++ b/src/transformers/models/tvp/processing_tvp.py @@ -16,7 +16,6 @@ Processor class for TVP. """ - from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding diff --git a/src/transformers/models/udop/configuration_udop.py b/src/transformers/models/udop/configuration_udop.py index 3802b6de01..bc1765e289 100644 --- a/src/transformers/models/udop/configuration_udop.py +++ b/src/transformers/models/udop/configuration_udop.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UDOP model configuration""" - +"""UDOP model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/udop/convert_udop_to_hf.py b/src/transformers/models/udop/convert_udop_to_hf.py index 7cbb2f161d..f2d54b8ca5 100644 --- a/src/transformers/models/udop/convert_udop_to_hf.py +++ b/src/transformers/models/udop/convert_udop_to_hf.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert UDOP checkpoints from the original repository. URL: https://github.com/microsoft/i-Code/tree/main/i-Code-Doc""" - import argparse import torch diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 0a2d003a56..02e7aac88b 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch UDOP model.""" +"""PyTorch UDOP model.""" import collections import logging diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py index 3eaf1f5cad..704b5c48de 100644 --- a/src/transformers/models/udop/tokenization_udop.py +++ b/src/transformers/models/udop/tokenization_udop.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for UDOP model.""" - +"""Tokenization classes for UDOP model.""" import os import re diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py index db3b90e05c..caa96c25f4 100644 --- a/src/transformers/models/udop/tokenization_udop_fast.py +++ b/src/transformers/models/udop/tokenization_udop_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for UDOP model.""" - +"""Tokenization classes for UDOP model.""" import os from shutil import copyfile diff --git a/src/transformers/models/umt5/configuration_umt5.py b/src/transformers/models/umt5/configuration_umt5.py index 9365717c28..d7323d759f 100644 --- a/src/transformers/models/umt5/configuration_umt5.py +++ b/src/transformers/models/umt5/configuration_umt5.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UMT5 model configuration""" +"""UMT5 model configuration""" + from typing import Mapping from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py b/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py index eeb5b3eb40..848ca3c566 100644 --- a/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py +++ b/src/transformers/models/umt5/convert_umt5_checkpoint_to_pytorch.py @@ -166,9 +166,9 @@ def convert_t5x_to_pytorch( if scalable_attention: # convert the rel_embedding of each layer - new[ - f"decoder.block.{i}.layer.0.SelfAttention.relative_attention_bias.weight" - ] = t5x_relpos_bias_lookup(old, i, "decoder").T + new[f"decoder.block.{i}.layer.0.SelfAttention.relative_attention_bias.weight"] = ( + t5x_relpos_bias_lookup(old, i, "decoder").T + ) new["decoder.final_layer_norm.weight"] = old["decoder/decoder_norm/scale"] diff --git a/src/transformers/models/umt5/modeling_umt5.py b/src/transformers/models/umt5/modeling_umt5.py index 1bf8469f77..3271689540 100644 --- a/src/transformers/models/umt5/modeling_umt5.py +++ b/src/transformers/models/umt5/modeling_umt5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch UMT5 model.""" +"""PyTorch UMT5 model.""" import copy import math diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index 18502adcb0..69bc162220 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UniSpeech model configuration""" +"""UniSpeech model configuration""" import functools import operator diff --git a/src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py index bf72930951..4eb8dfa7bb 100644 --- a/src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert UniSpeech checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index aa4bb7827e..0db34c1fc8 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch UniSpeech model.""" +"""PyTorch UniSpeech model.""" import math import warnings diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 87b4bc8506..85661b02b6 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UniSpeechSat model configuration""" +"""UniSpeechSat model configuration""" import functools import operator diff --git a/src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py index 56c9d52e18..fca35acb63 100644 --- a/src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/unispeech_sat/convert_unispeech_original_s3prl_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Convert Hubert checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py index 93750b64cc..4a70d41dd2 100644 --- a/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert UniSpeechSat checkpoint.""" - import argparse import fairseq diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 4663fc05d8..b78ffb4ac1 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch UniSpeechSat model.""" +"""PyTorch UniSpeechSat model.""" import math import warnings diff --git a/src/transformers/models/univnet/configuration_univnet.py b/src/transformers/models/univnet/configuration_univnet.py index 27850e114d..0f4dceb479 100644 --- a/src/transformers/models/univnet/configuration_univnet.py +++ b/src/transformers/models/univnet/configuration_univnet.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UnivNetModel model configuration""" +"""UnivNetModel model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index e4fc1215c0..5b0c659c30 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch UnivNetModel model.""" +"""PyTorch UnivNetModel model.""" from dataclasses import dataclass from typing import Optional, Tuple, Union diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 609818c80d..8d9ba47777 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UperNet model configuration""" - +"""UperNet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/upernet/modeling_upernet.py b/src/transformers/models/upernet/modeling_upernet.py index 58f64995ae..4a5b915e57 100644 --- a/src/transformers/models/upernet/modeling_upernet.py +++ b/src/transformers/models/upernet/modeling_upernet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.""" +"""PyTorch UperNet model. Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.""" from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index d05f0e51e1..9fd236e595 100644 --- a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VideoLlava model configuration""" +"""VideoLlava model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 7fbd142fbe..1059479334 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch VideoLlava model.""" +"""PyTorch VideoLlava model.""" + from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index b71600501b..e29c3478aa 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -16,7 +16,6 @@ Processor class for VideoLlava. """ - from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index b1cfcaecfa..a6150f3433 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VideoMAE model configuration""" +"""VideoMAE model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 05f74328b4..73a680ba3a 100755 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch VideoMAE (masked autoencoder) model.""" - +"""PyTorch VideoMAE (masked autoencoder) model.""" import collections.abc import math diff --git a/src/transformers/models/vilt/configuration_vilt.py b/src/transformers/models/vilt/configuration_vilt.py index ef0ce550d2..a57b400682 100644 --- a/src/transformers/models/vilt/configuration_vilt.py +++ b/src/transformers/models/vilt/configuration_vilt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VilT model configuration""" +"""VilT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py index e597d0d7e7..79b9f3ba03 100644 --- a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py +++ b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ViLT checkpoints from the original Github repository.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 4395bb47b5..bb564c1ceb 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViLT model.""" +"""PyTorch ViLT model.""" import collections.abc import math diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index e94d4be6c1..d98099b21b 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VipLlava model configuration""" +"""VipLlava model configuration""" import warnings diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 02821b03a2..c5f856e787 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch VipLlava model.""" +"""PyTorch VipLlava model.""" + from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py index 987c9a1afa..1af006a352 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Vision-Encoder-Text-Decoder architectures""" - +"""Classes to support Vision-Encoder-Text-Decoder architectures""" import os from typing import Optional, Tuple, Union diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 75ff2dbd82..383dd0e3e4 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support TF Vision-Encoder-Text-Decoder architectures""" - +"""Classes to support TF Vision-Encoder-Text-Decoder architectures""" from __future__ import annotations diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 1dbee5efcc..b6125fb4db 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Classes to support Vision-Encoder-Text-Decoder architectures""" - +"""Classes to support Vision-Encoder-Text-Decoder architectures""" import gc import os diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index aab76f71db..a3191464ec 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VisionTextDualEncoder model configuration""" - +"""VisionTextDualEncoder model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py index ba8bf7091b..23244af1e3 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Flax VisionTextDualEncoder model.""" - +"""Flax VisionTextDualEncoder model.""" from typing import Optional, Tuple diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py index 6f7e30d3f6..077b452f70 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py @@ -14,7 +14,6 @@ # limitations under the License. """TensorFlow VisionTextDualEncoder model.""" - from __future__ import annotations import re diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index 0f82bdd0c3..5b90faa886 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch VisionTextDualEncoder model.""" - +"""PyTorch VisionTextDualEncoder model.""" from typing import Optional, Tuple, Union diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py index bb146a143a..ae98229a7d 100644 --- a/src/transformers/models/visual_bert/configuration_visual_bert.py +++ b/src/transformers/models/visual_bert/configuration_visual_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VisualBERT model configuration""" +"""VisualBERT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py index d1e95630bd..dd03623c88 100644 --- a/src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/visual_bert/convert_visual_bert_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert VisualBert checkpoint.""" - import argparse from collections import OrderedDict from pathlib import Path diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index f482ed8cd7..41c8709b3e 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch VisualBERT model.""" - +"""PyTorch VisualBERT model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index 286d302c78..bacec851a9 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ViT model configuration""" +"""ViT model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/vit/convert_dino_to_pytorch.py b/src/transformers/models/vit/convert_dino_to_pytorch.py index 7eec823ad5..8608da8eb4 100644 --- a/src/transformers/models/vit/convert_dino_to_pytorch.py +++ b/src/transformers/models/vit/convert_dino_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ViT checkpoints trained with the DINO method.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index 0ccd9b9f66..7892842f8d 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ViT and non-distilled DeiT checkpoints from the timm library.""" - import argparse from pathlib import Path diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index ac5cf691e9..2cf120df74 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 ViT model.""" - +"""TF 2.0 ViT model.""" from __future__ import annotations diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index dfda7bf731..7897555a6b 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT model.""" - +"""PyTorch ViT model.""" import collections.abc import math diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index 2b9dcd0a81..78349af336 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" ViT Hybrid model configuration""" - +"""ViT Hybrid model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index e88ee246ba..1d717d74c9 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert ViT hybrid checkpoints from the timm library.""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 359b5e3fb9..9c92e592c1 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT Hybrid model.""" - +"""PyTorch ViT Hybrid model.""" import collections.abc import math diff --git a/src/transformers/models/vit_mae/configuration_vit_mae.py b/src/transformers/models/vit_mae/configuration_vit_mae.py index e4e46e7e42..d20b5af130 100644 --- a/src/transformers/models/vit_mae/configuration_vit_mae.py +++ b/src/transformers/models/vit_mae/configuration_vit_mae.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ViT MAE model configuration""" +"""ViT MAE model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py index fff8234e06..cc7a29b192 100644 --- a/src/transformers/models/vit_mae/modeling_tf_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 ViT MAE (masked autoencoder) model.""" - +"""TF 2.0 ViT MAE (masked autoencoder) model.""" from __future__ import annotations diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index f434efbe3e..f2afd84fc1 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT MAE (masked autoencoder) model.""" - +"""PyTorch ViT MAE (masked autoencoder) model.""" import collections.abc import math diff --git a/src/transformers/models/vit_msn/configuration_vit_msn.py b/src/transformers/models/vit_msn/configuration_vit_msn.py index 14acb15d54..7cf4414f8d 100644 --- a/src/transformers/models/vit_msn/configuration_vit_msn.py +++ b/src/transformers/models/vit_msn/configuration_vit_msn.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" ViT MSN model configuration""" - +"""ViT MSN model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index fe5a880027..c89370be5c 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT MSN (masked siamese network) model.""" - +"""PyTorch ViT MSN (masked siamese network) model.""" import collections.abc import math diff --git a/src/transformers/models/vitdet/configuration_vitdet.py b/src/transformers/models/vitdet/configuration_vitdet.py index f85558c254..856f228a5b 100644 --- a/src/transformers/models/vitdet/configuration_vitdet.py +++ b/src/transformers/models/vitdet/configuration_vitdet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VitDet model configuration""" - +"""VitDet model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py index 9dc3e476f3..edf934581c 100644 --- a/src/transformers/models/vitdet/modeling_vitdet.py +++ b/src/transformers/models/vitdet/modeling_vitdet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViTDet backbone.""" - +"""PyTorch ViTDet backbone.""" import collections.abc import math diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 67f562d2bd..0083bc1ef2 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VitMatte model configuration""" +"""VitMatte model configuration""" import copy from typing import List diff --git a/src/transformers/models/vitmatte/modeling_vitmatte.py b/src/transformers/models/vitmatte/modeling_vitmatte.py index f7bfb06d9a..fca94e7032 100644 --- a/src/transformers/models/vitmatte/modeling_vitmatte.py +++ b/src/transformers/models/vitmatte/modeling_vitmatte.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch ViTMatte model.""" +"""PyTorch ViTMatte model.""" from dataclasses import dataclass from typing import Optional, Tuple diff --git a/src/transformers/models/vits/configuration_vits.py b/src/transformers/models/vits/configuration_vits.py index 8d5ffca36f..0f2aeb8ac4 100644 --- a/src/transformers/models/vits/configuration_vits.py +++ b/src/transformers/models/vits/configuration_vits.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" VITS model configuration""" - +"""VITS model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vits/modeling_vits.py b/src/transformers/models/vits/modeling_vits.py index 905945e01a..def1361cc0 100644 --- a/src/transformers/models/vits/modeling_vits.py +++ b/src/transformers/models/vits/modeling_vits.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch VITS model.""" +"""PyTorch VITS model.""" import math from dataclasses import dataclass diff --git a/src/transformers/models/vits/tokenization_vits.py b/src/transformers/models/vits/tokenization_vits.py index c8b115c176..4c02857483 100644 --- a/src/transformers/models/vits/tokenization_vits.py +++ b/src/transformers/models/vits/tokenization_vits.py @@ -14,7 +14,6 @@ # limitations under the License. """Tokenization class for VITS.""" - import json import os import re diff --git a/src/transformers/models/vivit/configuration_vivit.py b/src/transformers/models/vivit/configuration_vivit.py index 4cbebc7692..63895e4fb7 100644 --- a/src/transformers/models/vivit/configuration_vivit.py +++ b/src/transformers/models/vivit/configuration_vivit.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ViViT model configuration""" +"""ViViT model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py index 571dfe896e..c3075d6034 100644 --- a/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py +++ b/src/transformers/models/vivit/convert_vivit_flax_to_pytorch.py @@ -15,6 +15,7 @@ """Convert Flax ViViT checkpoints from the original repository to PyTorch. URL: https://github.com/google-research/scenic/tree/main/scenic/projects/vivit """ + import argparse import json import os.path diff --git a/src/transformers/models/vivit/image_processing_vivit.py b/src/transformers/models/vivit/image_processing_vivit.py index 9b62aedc23..35babdfd2c 100644 --- a/src/transformers/models/vivit/image_processing_vivit.py +++ b/src/transformers/models/vivit/image_processing_vivit.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Image processor class for Vivit.""" + from typing import Dict, List, Optional, Union import numpy as np diff --git a/src/transformers/models/vivit/modeling_vivit.py b/src/transformers/models/vivit/modeling_vivit.py index c8bac4f02e..97891bee1a 100755 --- a/src/transformers/models/vivit/modeling_vivit.py +++ b/src/transformers/models/vivit/modeling_vivit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViViT model.""" - +"""PyTorch ViViT model.""" import math from typing import Optional, Set, Tuple, Union diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index 1d6777efcb..b4a676ddba 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Wav2Vec2 model configuration""" +"""Wav2Vec2 model configuration""" import functools import operator diff --git a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py index 28554691c6..0a6aff8c81 100644 --- a/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Wav2Vec2 checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py index bcc9fd95a4..1702bc5a47 100644 --- a/src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Hubert checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 86cfb5e089..9a24b9d39f 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax Wav2Vec2 model.""" +"""Flax Wav2Vec2 model.""" from functools import partial from typing import Optional, Tuple, Union diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index efbae8f232..a56a2cb55e 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TensorFlow Wav2Vec2 model.""" - +"""TensorFlow Wav2Vec2 model.""" from __future__ import annotations diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 5fb64c1f2c..bdf2d4f50a 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Wav2Vec2 model.""" +"""PyTorch Wav2Vec2 model.""" import math import warnings diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index dc6e9d14ee..1b47c0a988 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -15,6 +15,7 @@ """ Speech processor class for Wav2Vec2 """ + import warnings from contextlib import contextmanager diff --git a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py index f6c364884b..20b4e4fa13 100644 --- a/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Wav2Vec2Bert model configuration""" - +"""Wav2Vec2Bert model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py index 8b77cd71f7..6405f45470 100644 --- a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py +++ b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Wav2Vec2Bert BERT checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index 077546a65e..db4e5985e6 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch Wav2Vec2-BERT model.""" +"""PyTorch Wav2Vec2-BERT model.""" import math import warnings diff --git a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py index ec792ce75a..d24c672007 100644 --- a/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py @@ -15,6 +15,7 @@ """ Speech processor class for Wav2Vec2-BERT """ + import warnings from ...processing_utils import ProcessorMixin diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py index 5c931342c9..05cdceeb9c 100644 --- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Wav2Vec2Conformer model configuration""" +"""Wav2Vec2Conformer model configuration""" import functools import operator diff --git a/src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py index 1a882e95ab..8c435c6cd9 100644 --- a/src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Wav2Vec2Conformer checkpoint.""" - import argparse import json import os diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index da8ed52faf..14e80b516c 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Wav2Vec2-Conformer model.""" +"""PyTorch Wav2Vec2-Conformer model.""" import math import warnings diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index 8809e2c2e8..ff4704c778 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -78,7 +78,6 @@ class Wav2Vec2PhonemeCTCTokenizerOutput(ModelOutput): class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): - """ Constructs a Wav2Vec2PhonemeCTC tokenizer. 
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 9fceb1e61a..30652d5f3a 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -15,6 +15,7 @@ """ Speech processor class for Wav2Vec2 """ + import os import warnings from contextlib import contextmanager, nullcontext diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index a860475336..3faeb7ab53 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" WavLM model configuration""" +"""WavLM model configuration""" import functools import operator diff --git a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py index 84e3d231ea..e41ae0881d 100644 --- a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert WavLM checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py index e41aa0099a..447d4db67f 100644 --- a/src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/wavlm/convert_wavlm_original_s3prl_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert Hubert checkpoint.""" - import argparse import torch diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index dd52277b9a..320959b439 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch WavLM model.""" +"""PyTorch WavLM model.""" import math import warnings diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index c924a21c2a..e7c0f47b58 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Whisper model configuration""" +"""Whisper model configuration""" from collections import OrderedDict from typing import TYPE_CHECKING, Any, Mapping, Optional, Union diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index f2d6da5660..22f31c4b2e 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -15,6 +15,7 @@ """ Feature extractor class for Whisper """ + from typing import List, Optional, Union import numpy as np diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index 14ce899ab1..8ad4ed9de2 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax whisper model.""" +"""Flax whisper model.""" import math import random diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 8033bb584f..18f55dce8a 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TensorFlow Whisper model.""" - +"""TensorFlow Whisper model.""" from __future__ import annotations diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index e4fda437bf..f24da860a1 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Whisper model.""" +"""PyTorch Whisper model.""" + import math from typing import Optional, Tuple, Union diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py index cae72aa0e1..f22aae143e 100644 --- a/src/transformers/models/whisper/processing_whisper.py +++ b/src/transformers/models/whisper/processing_whisper.py @@ -16,7 +16,6 @@ Speech processor class for Whisper """ - from ...processing_utils import ProcessorMixin diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 0a6ad5be6e..5f7f0ae53d 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Tokenization classes for Whisper.""" + import json import os import warnings diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 54aa60839c..a9e57cca72 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for Whisper.""" + import json import os import re diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 757429d6df..6314616eef 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" X-CLIP model configuration""" +"""X-CLIP model configuration""" import os from typing import Union diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 092ea94761..21b1c88aa0 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch X-CLIP model.""" - +"""PyTorch X-CLIP model.""" from copy import copy from dataclasses import dataclass diff --git a/src/transformers/models/xglm/configuration_xglm.py b/src/transformers/models/xglm/configuration_xglm.py index 8eebcfaee6..c5a275405d 100644 --- a/src/transformers/models/xglm/configuration_xglm.py +++ b/src/transformers/models/xglm/configuration_xglm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XGLM model configuration""" +"""XGLM model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/xglm/modeling_flax_xglm.py b/src/transformers/models/xglm/modeling_flax_xglm.py index d6b90a7f00..473448c66c 100644 --- a/src/transformers/models/xglm/modeling_flax_xglm.py +++ b/src/transformers/models/xglm/modeling_flax_xglm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Flax XGLM model.""" - +"""Flax XGLM model.""" import math import random diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 6b563b6657..a62396b79c 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" TF 2.0 XGLM model.""" - +"""TF 2.0 XGLM model.""" from __future__ import annotations diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 27d5fa36d7..4f16935834 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch XGLM model.""" - +"""PyTorch XGLM model.""" import math from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py index 818ca163da..8713d5f129 100644 --- a/src/transformers/models/xglm/tokenization_xglm.py +++ b/src/transformers/models/xglm/tokenization_xglm.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for .""" + import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index 2f8b5d6ef2..39db12c51b 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XLM configuration""" +"""XLM configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py index 6f3cdf920a..71c3a1f989 100755 --- a/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/xlm/convert_xlm_original_pytorch_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert OpenAI GPT checkpoint.""" - import argparse import json diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index ff5211a27c..f03f416a08 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - TF 2.0 XLM model. +TF 2.0 XLM model. """ - from __future__ import annotations import itertools diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 23f93bebb7..2803836309 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - PyTorch XLM model. +PyTorch XLM model. """ import itertools diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index b39e4c2708..b20823e017 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Tokenization classes for XLM.""" - import json import os import re @@ -129,8 +128,8 @@ def romanian_preprocessing(text): text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma text = text.replace("\u0102", "A").replace("\u0103", "a") - text = text.replace("\u00C2", "A").replace("\u00E2", "a") - text = text.replace("\u00CE", "I").replace("\u00EE", "i") + text = text.replace("\u00c2", "A").replace("\u00e2", "a") + text = text.replace("\u00ce", "I").replace("\u00ee", "i") return text diff --git a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py index bfb2a898f0..94d38242b6 100644 --- a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XLM-ProphetNet model configuration""" - +"""XLM-ProphetNet model configuration""" from typing import Callable, Optional, Union diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index ccba8fde5d..669c21026e 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch XLM-ProphetNet model.""" - +"""PyTorch XLM-ProphetNet model.""" import copy import math diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py index b9e348e498..100321db48 100644 --- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XLM-RoBERTa configuration""" +"""XLM-RoBERTa configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py index c0ad2c7c7d..efc5b69667 100644 --- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" TF 2.0 XLM-RoBERTa model.""" - +"""TF 2.0 XLM-RoBERTa model.""" from __future__ import annotations diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 20300a19ce..35a06aeb91 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for XLM-RoBERTa model.""" - +"""Tokenization classes for XLM-RoBERTa model.""" import os from shutil import copyfile diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index f32e715154..4ad2596a6f 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License -""" Tokenization classes for XLM-RoBERTa model.""" - +"""Tokenization classes for XLM-RoBERTa model.""" import os from shutil import copyfile diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py index ac7b13755c..6ee323ae76 100644 --- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XLM_ROBERTa_XL configuration""" +"""XLM_ROBERTa_XL configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py index 957a09dbf6..0a35b204f4 100644 --- a/src/transformers/models/xlnet/configuration_xlnet.py +++ b/src/transformers/models/xlnet/configuration_xlnet.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" XLNet configuration""" +"""XLNet configuration""" import warnings diff --git a/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py index 804b52b0dc..a15c5f22ad 100755 --- a/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/xlnet/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert BERT checkpoint.""" - import argparse import os diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index f998b19bb6..eeacebae04 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -14,10 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - TF 2.0 XLNet model. +TF 2.0 XLNet model. """ - from __future__ import annotations import warnings diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 7d04fe8cfa..5d424ebe12 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -14,8 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - PyTorch XLNet model. 
+PyTorch XLNet model. """ + import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 8d87f34ba2..9d4b35775e 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for XLNet model.""" - +"""Tokenization classes for XLNet model.""" import os import unicodedata diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index d77307e7a3..a506e8c45a 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Tokenization classes for XLNet model.""" - +"""Tokenization classes for XLNet model.""" import os from shutil import copyfile diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py index 4ca5265251..34261a0d7c 100644 --- a/src/transformers/models/xmod/configuration_xmod.py +++ b/src/transformers/models/xmod/configuration_xmod.py @@ -13,7 +13,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" X-MOD configuration""" +"""X-MOD configuration""" + from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/yolos/configuration_yolos.py b/src/transformers/models/yolos/configuration_yolos.py index 2493403b4f..d6fe04f426 100644 --- a/src/transformers/models/yolos/configuration_yolos.py +++ b/src/transformers/models/yolos/configuration_yolos.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" YOLOS model configuration""" +"""YOLOS model configuration""" from collections import OrderedDict from typing import Mapping diff --git a/src/transformers/models/yolos/convert_yolos_to_pytorch.py b/src/transformers/models/yolos/convert_yolos_to_pytorch.py index 35238151ab..6cddc60661 100644 --- a/src/transformers/models/yolos/convert_yolos_to_pytorch.py +++ b/src/transformers/models/yolos/convert_yolos_to_pytorch.py @@ -14,7 +14,6 @@ # limitations under the License. """Convert YOLOS checkpoints from the original repository. URL: https://github.com/hustvl/YOLOS""" - import argparse import json from pathlib import Path diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index faae349e6e..2acf48849a 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" PyTorch YOLOS model.""" - +"""PyTorch YOLOS model.""" import collections.abc import math diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py index 906856fa5d..e353744cc7 100644 --- a/src/transformers/models/yoso/configuration_yoso.py +++ b/src/transformers/models/yoso/configuration_yoso.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" YOSO model configuration""" +"""YOSO model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index c2d327bf84..a94c6de542 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch YOSO model.""" - +"""PyTorch YOSO model.""" import math from pathlib import Path diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 25023430ed..f27913156c 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -14,7 +14,6 @@ # ============================================================================== """Functions and classes related to optimization (weight updates).""" - import re from typing import Callable, List, Optional, Union diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index a8e47fb683..d76fa4dccc 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Processing saving/loading class for common processors. +Processing saving/loading class for common processors. """ import copy diff --git a/src/transformers/quantizers/base.py b/src/transformers/quantizers/base.py index 1cfb0d5856..3ee28ada1b 100644 --- a/src/transformers/quantizers/base.py +++ b/src/transformers/quantizers/base.py @@ -212,19 +212,15 @@ class HfQuantizer(ABC): ) @abstractmethod - def _process_model_before_weight_loading(self, model, **kwargs): - ... + def _process_model_before_weight_loading(self, model, **kwargs): ... @abstractmethod - def _process_model_after_weight_loading(self, model, **kwargs): - ... + def _process_model_after_weight_loading(self, model, **kwargs): ... @property @abstractmethod - def is_serializable(self): - ... + def is_serializable(self): ... @property @abstractmethod - def is_trainable(self): - ... + def is_trainable(self): ... diff --git a/src/transformers/time_series_utils.py b/src/transformers/time_series_utils.py index 02eddd72ce..7d9716e481 100644 --- a/src/transformers/time_series_utils.py +++ b/src/transformers/time_series_utils.py @@ -16,6 +16,7 @@ """ Time series distributional output classes and utilities. 
""" + from typing import Callable, Dict, Optional, Tuple import torch diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 8f1b15c1c1..f936bc25ad 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see - tokenization_utils_fast.py +Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see +tokenization_utils_fast.py """ + import bisect import itertools import re @@ -943,12 +944,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) @overload - def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: - ... + def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ... @overload - def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: - ... + def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: ... def convert_ids_to_tokens( self, ids: Union[int, List[int]], skip_special_tokens: bool = False diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 494c049a15..07e1ef3651 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers - see tokenization_utils.py +Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers +see tokenization_utils.py """ + import copy import json import os diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index d2570ed8ba..45ecf7c80c 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -15,6 +15,7 @@ """ Callbacks to use with the Trainer class and customize the training loop. """ + import copy import dataclasses import json diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py index 14fcfe4a50..6f72494ad8 100644 --- a/src/transformers/utils/backbone_utils.py +++ b/src/transformers/utils/backbone_utils.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" Collection of utils to be used by backbones and their components.""" +"""Collection of utils to be used by backbones and their components.""" import enum import inspect diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index b19efac130..c3687c035c 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -634,9 +634,9 @@ _MANUAL_META_OVERRIDES: Dict[Callable, Callable] = { } if is_torch_greater_or_equal_than_2_0: - _MANUAL_META_OVERRIDES[ - torch.nn.functional.scaled_dot_product_attention - ] = torch_nn_functional_scaled_dot_product_attention + _MANUAL_META_OVERRIDES[torch.nn.functional.scaled_dot_product_attention] = ( + torch_nn_functional_scaled_dot_product_attention + ) class HFProxy(Proxy): diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index ef32ecde72..173fcb352d 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -14,6 +14,7 @@ """ Hub utilities: utilities related to download and cache models """ + import json import os import re diff --git a/src/transformers/utils/logging.py b/src/transformers/utils/logging.py index 3471e5ab66..f2fbe393f7 100644 --- a/src/transformers/utils/logging.py +++ b/src/transformers/utils/logging.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Logging utilities.""" - +"""Logging utilities.""" import functools import logging diff --git a/src/transformers/utils/sentencepiece_model_pb2_new.py b/src/transformers/utils/sentencepiece_model_pb2_new.py index 4a2e29b1bd..65d03e5a28 100644 --- a/src/transformers/utils/sentencepiece_model_pb2_new.py +++ b/src/transformers/utils/sentencepiece_model_pb2_new.py @@ -2,6 +2,7 @@ # Generated by the protocol buffer compiler. DO NOT EDIT! # source: sentencepiece_model.proto """Generated protocol buffer code.""" + from google.protobuf import descriptor as _descriptor from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import symbol_database as _symbol_database diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index cf6962a3c6..f2d1655f25 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ALIGN model. """ - +"""Testing suite for the PyTorch ALIGN model.""" import inspect import os diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 8c5be789c0..c7c735b19f 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch AltCLIP model. 
""" - +"""Testing suite for the PyTorch AltCLIP model.""" import inspect import os diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index ad10b99547..9ab07b5508 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Audio Spectrogram Transformer (AST) model. """ +"""Testing suite for the PyTorch Audio Spectrogram Transformer (AST) model.""" import inspect import unittest diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index 265f5dd7b7..fb00c403f6 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ b/tests/models/autoformer/test_modeling_autoformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Autoformer model. """ +"""Testing suite for the PyTorch Autoformer model.""" import inspect import tempfile diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index bfc6a4dadf..67c18288a8 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Bark model. """ - +"""Testing suite for the PyTorch Bark model.""" import copy import inspect diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index 3804933735..9ca4db900d 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BART model. """ - +"""Testing suite for the PyTorch BART model.""" import copy import tempfile diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 50287cb7bc..1010c6007d 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BEiT model. 
""" - +"""Testing suite for the PyTorch BEiT model.""" import unittest diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index cf3cc1dce1..5cebf58029 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -66,14 +66,14 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) @@ -84,7 +84,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -103,7 +103,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer(do_lower_case=True) rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -121,7 +121,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -129,7 +129,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -137,7 +137,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -145,7 +145,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -153,7 +153,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -208,7 +208,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/bert/test_tokenization_bert_tf.py b/tests/models/bert/test_tokenization_bert_tf.py index f950e7439c..0539613a10 100644 --- a/tests/models/bert/test_tokenization_bert_tf.py +++ b/tests/models/bert/test_tokenization_bert_tf.py @@ -50,7 +50,7 @@ class BertTokenizationTest(unittest.TestCase): self.test_sentences = [ "This is a straightforward English test sentence.", - "This one has some weird characters\rto\nsee\r\nif those\u00E9break things.", + "This one has some weird characters\rto\nsee\r\nif those\u00e9break things.", "Now we're going to add some Chinese: 一 二 三 一二三", "And some much more rare Chinese: 齉 堃 齉堃", "Je vais aussi écrire en français pour tester les accents", diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 02af95879a..dd22eb4a6d 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BigBird model. """ - +"""Testing suite for the PyTorch BigBird model.""" import unittest diff --git a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py index 82b7cb574d..8a009aef5e 100644 --- a/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py +++ b/tests/models/bigbird_pegasus/test_modeling_bigbird_pegasus.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BigBirdPegasus model. """ - +"""Testing suite for the PyTorch BigBirdPegasus model.""" import copy import tempfile diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index 58dd39e86a..51e836a1f9 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BioGPT model. 
""" +"""Testing suite for the PyTorch BioGPT model.""" import math import unittest diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py index dbc4cacdeb..ec666ec973 100644 --- a/tests/models/bit/test_modeling_bit.py +++ b/tests/models/bit/test_modeling_bit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Bit model. """ - +"""Testing suite for the PyTorch Bit model.""" import unittest diff --git a/tests/models/blenderbot/test_modeling_blenderbot.py b/tests/models/blenderbot/test_modeling_blenderbot.py index 64ae71b24b..eaab22b7d5 100644 --- a/tests/models/blenderbot/test_modeling_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_blenderbot.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Blenderbot model. """ +"""Testing suite for the PyTorch Blenderbot model.""" import tempfile import unittest diff --git a/tests/models/blenderbot/test_tokenization_blenderbot.py b/tests/models/blenderbot/test_tokenization_blenderbot.py index 7fbf2b7603..70f55f9448 100644 --- a/tests/models/blenderbot/test_tokenization_blenderbot.py +++ b/tests/models/blenderbot/test_tokenization_blenderbot.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer.""" + import unittest from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index 39e953490f..8f847eb52d 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BlenderbotSmall model. """ +"""Testing suite for the PyTorch BlenderbotSmall model.""" import tempfile import unittest diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py index 369dde6739..7ee3e989fb 100644 --- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for the Blenderbot small tokenizer.""" + import json import os import unittest diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 89404342f0..2ea4f09214 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Blip model. 
""" - +"""Testing suite for the PyTorch Blip model.""" import inspect import os diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 3c12a7e9ea..ea6e138b7e 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Blip model. """ +"""Testing suite for the PyTorch Blip model.""" + import unittest import numpy as np diff --git a/tests/models/blip/test_modeling_tf_blip.py b/tests/models/blip/test_modeling_tf_blip.py index a35eb7a1bd..06e7d7e224 100644 --- a/tests/models/blip/test_modeling_tf_blip.py +++ b/tests/models/blip/test_modeling_tf_blip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow Blip model. """ - +"""Testing suite for the TensorFlow Blip model.""" from __future__ import annotations diff --git a/tests/models/blip/test_modeling_tf_blip_text.py b/tests/models/blip/test_modeling_tf_blip_text.py index 7583b61b58..dff9bef82e 100644 --- a/tests/models/blip/test_modeling_tf_blip_text.py +++ b/tests/models/blip/test_modeling_tf_blip_text.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow Blip model. """ +"""Testing suite for the TensorFlow Blip model.""" + from __future__ import annotations import unittest diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index d2f3b2b719..f32ffea850 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BLIP-2 model. """ - +"""Testing suite for the PyTorch BLIP-2 model.""" import inspect import tempfile diff --git a/tests/models/bridgetower/test_modeling_bridgetower.py b/tests/models/bridgetower/test_modeling_bridgetower.py index c9ce8f076f..b8de60ac5c 100644 --- a/tests/models/bridgetower/test_modeling_bridgetower.py +++ b/tests/models/bridgetower/test_modeling_bridgetower.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch BridgeTower model. """ +"""Testing suite for the PyTorch BridgeTower model.""" import tempfile import unittest diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py index 4b1290ed49..14f904c3b7 100644 --- a/tests/models/bros/test_modeling_bros.py +++ b/tests/models/bros/test_modeling_bros.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Bros model. 
""" +"""Testing suite for the PyTorch Bros model.""" import copy import unittest diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 3793241d7e..0033021ea7 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -181,7 +181,7 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) @@ -197,7 +197,7 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 5c342ee975..2d2dfef40e 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CANINE model. """ - +"""Testing suite for the PyTorch CANINE model.""" import unittest from typing import List, Tuple diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index ec987f6dd6..d34ac324ea 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -98,7 +98,7 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) @@ -114,7 +114,7 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" additional_special_tokens = tokenizer.additional_special_tokens diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 8ee9028eca..0b8220c8ed 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Chinese-CLIP model. 
""" +"""Testing suite for the PyTorch Chinese-CLIP model.""" import inspect import os diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index f06fabf0a2..835a06b0ce 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CLAP model. """ - +"""Testing suite for the PyTorch CLAP model.""" import inspect import os diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 16c8f47b78..fa33176363 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CLIP model. """ - +"""Testing suite for the PyTorch CLIP model.""" import inspect import os diff --git a/tests/models/clip/test_modeling_tf_clip.py b/tests/models/clip/test_modeling_tf_clip.py index 4e1ec7f88e..83080db94b 100644 --- a/tests/models/clip/test_modeling_tf_clip.py +++ b/tests/models/clip/test_modeling_tf_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow CLIP model. """ - +"""Testing suite for the TensorFlow CLIP model.""" from __future__ import annotations diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index ec1cbd08ac..5885f8933c 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -97,11 +97,11 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Test that the tokenization is identical on unicode of space type spaces_unicodes = [ "\u0009", # (horizontal tab, '\t') - "\u000B", # (vertical tab) - "\u000C", # (form feed) + "\u000b", # (vertical tab) + "\u000c", # (form feed) "\u0020", # (space, ' ') - "\u200E", # (left-to-right mark):w - "\u200F", # (right-to-left mark) + "\u200e", # (left-to-right mark):w + "\u200f", # (right-to-left mark) ] for unicode_seq in spaces_unicodes: text_tokenized_s = tokenizer_s.tokenize(unicode_seq) @@ -111,11 +111,11 @@ class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Test that the tokenization is identical on unicode of line break type line_break_unicodes = [ - "\u000A", # (line feed, '\n') + "\u000a", # (line feed, '\n') "\r\n", # (carriage return and line feed, '\r\n') - "\u000D", # (carriage return, '\r') + "\u000d", # (carriage return, '\r') "\r", # (carriage return, '\r') - "\u000D", # (carriage return, '\r') + "\u000d", # (carriage return, '\r') "\u2028", # (line separator) "\u2029", # (paragraph separator) # "\u0085", # (next line) diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 6b82a9af41..9e819d3b26 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CLIPSeg model. """ - +"""Testing suite for the PyTorch CLIPSeg model.""" import inspect import os diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 7d5064377f..1370c9acc8 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Clvp model. """ - +"""Testing suite for the PyTorch Clvp model.""" import gc import tempfile diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 07fd363724..a6fb6b39ff 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Cohere model. """ +"""Testing suite for the PyTorch Cohere model.""" import unittest diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py index a01acffafb..6fea53fc66 100644 --- a/tests/models/conditional_detr/test_modeling_conditional_detr.py +++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Conditional DETR model. """ - +"""Testing suite for the PyTorch Conditional DETR model.""" import inspect import math diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index 80a31c9bc2..0866f76798 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ConvBERT model. """ +"""Testing suite for the PyTorch ConvBERT model.""" + import os import tempfile import unittest diff --git a/tests/models/convnext/test_modeling_convnext.py b/tests/models/convnext/test_modeling_convnext.py index 9f0789dffc..b618608f0e 100644 --- a/tests/models/convnext/test_modeling_convnext.py +++ b/tests/models/convnext/test_modeling_convnext.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ConvNext model. """ - +"""Testing suite for the PyTorch ConvNext model.""" import unittest diff --git a/tests/models/convnext/test_modeling_tf_convnext.py b/tests/models/convnext/test_modeling_tf_convnext.py index 4a06632513..ec8ea8e13d 100644 --- a/tests/models/convnext/test_modeling_tf_convnext.py +++ b/tests/models/convnext/test_modeling_tf_convnext.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow ConvNext model. """ +"""Testing suite for the TensorFlow ConvNext model.""" from __future__ import annotations diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index 5d78d31c3e..3201de6f93 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ConvNextV2 model. """ - +"""Testing suite for the PyTorch ConvNextV2 model.""" import unittest diff --git a/tests/models/convnextv2/test_modeling_tf_convnextv2.py b/tests/models/convnextv2/test_modeling_tf_convnextv2.py index 5f0bd2f29d..58b4795afd 100644 --- a/tests/models/convnextv2/test_modeling_tf_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_tf_convnextv2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow ConvNext model. """ +"""Testing suite for the TensorFlow ConvNext model.""" from __future__ import annotations diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py index 7a037becbf..5473791b85 100644 --- a/tests/models/cpmant/test_modeling_cpmant.py +++ b/tests/models/cpmant/test_modeling_cpmant.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CPMAnt model. """ +"""Testing suite for the PyTorch CPMAnt model.""" import unittest diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py index 8e9376de27..9facafcd77 100644 --- a/tests/models/cvt/test_modeling_cvt.py +++ b/tests/models/cvt/test_modeling_cvt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch CvT model. """ - +"""Testing suite for the PyTorch CvT model.""" import unittest from math import floor diff --git a/tests/models/cvt/test_modeling_tf_cvt.py b/tests/models/cvt/test_modeling_tf_cvt.py index 0cae0bbcf2..211529719a 100644 --- a/tests/models/cvt/test_modeling_tf_cvt.py +++ b/tests/models/cvt/test_modeling_tf_cvt.py @@ -1,5 +1,4 @@ -""" Testing suite for the Tensorflow CvT model. """ - +"""Testing suite for the Tensorflow CvT model.""" from __future__ import annotations diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index b9e3bff346..0376edad2d 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. 
""" +"""Testing suite for the PyTorch Data2VecAudio model.""" import math import unittest diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py index 5a3edaa7ad..c5e07f8e2e 100644 --- a/tests/models/data2vec/test_modeling_data2vec_text.py +++ b/tests/models/data2vec/test_modeling_data2vec_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Data2VecAudio model. """ +"""Testing suite for the PyTorch Data2VecAudio model.""" import unittest diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index c426a6ca7e..99cbd66fbb 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Data2VecVision model. """ - +"""Testing suite for the PyTorch Data2VecVision model.""" import unittest diff --git a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py b/tests/models/data2vec/test_modeling_tf_data2vec_vision.py index bb6e0d5476..5ccbca8f0d 100644 --- a/tests/models/data2vec/test_modeling_tf_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_tf_data2vec_vision.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow Data2VecVision model. """ +"""Testing suite for the TensorFlow Data2VecVision model.""" from __future__ import annotations diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 4c6b74a4d7..66c9600084 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DBRX model. """ - +"""Testing suite for the PyTorch DBRX model.""" import unittest diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py index f7f362dce8..5d69bbd0b2 100644 --- a/tests/models/decision_transformer/test_modeling_decision_transformer.py +++ b/tests/models/decision_transformer/test_modeling_decision_transformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DecisionTransformer model. 
""" - +"""Testing suite for the PyTorch DecisionTransformer model.""" import inspect import unittest diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py index 5831d338dc..2ae3e3f088 100644 --- a/tests/models/deformable_detr/test_modeling_deformable_detr.py +++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Deformable DETR model. """ - +"""Testing suite for the PyTorch Deformable DETR model.""" import inspect import math diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index adc7154a50..f558ee09a5 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DeiT model. """ - +"""Testing suite for the PyTorch DeiT model.""" import unittest import warnings diff --git a/tests/models/deit/test_modeling_tf_deit.py b/tests/models/deit/test_modeling_tf_deit.py index c26635cef6..fefd506800 100644 --- a/tests/models/deit/test_modeling_tf_deit.py +++ b/tests/models/deit/test_modeling_tf_deit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow DeiT model. """ - +"""Testing suite for the TensorFlow DeiT model.""" from __future__ import annotations diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index 3b807abf71..ef1326520a 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Depth Anything model. """ - +"""Testing suite for the PyTorch Depth Anything model.""" import unittest diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py index faab773efd..fa840212a5 100644 --- a/tests/models/deta/test_modeling_deta.py +++ b/tests/models/deta/test_modeling_deta.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DETA model. """ - +"""Testing suite for the PyTorch DETA model.""" import collections import inspect diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py index ee1af9ed9d..f6277cced3 100644 --- a/tests/models/detr/test_modeling_detr.py +++ b/tests/models/detr/test_modeling_detr.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DETR model. 
""" - +"""Testing suite for the PyTorch DETR model.""" import inspect import math diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index 158ce77395..2fe9de14b9 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Dinat model. """ +"""Testing suite for the PyTorch Dinat model.""" import collections import unittest diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index 9896f2c2bb..5bf2d51b2d 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Dinov2 model. """ - +"""Testing suite for the PyTorch Dinov2 model.""" import unittest diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index 4d9be165bb..8b86dffab1 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Donut Swin model. """ +"""Testing suite for the PyTorch Donut Swin model.""" import collections import unittest diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index a49f8d5d94..8c6231bc1c 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DPT model. """ - +"""Testing suite for the PyTorch DPT model.""" import unittest diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index 01d5398edd..2b37b816f4 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DPT model. """ - +"""Testing suite for the PyTorch DPT model.""" import unittest diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index a63e736e41..2caa8d71ae 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch DPT model. 
""" - +"""Testing suite for the PyTorch DPT model.""" import unittest diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py index 15a4cb0be3..6b7ce810ce 100644 --- a/tests/models/efficientformer/test_modeling_efficientformer.py +++ b/tests/models/efficientformer/test_modeling_efficientformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch EfficientFormer model. """ - +"""Testing suite for the PyTorch EfficientFormer model.""" import unittest import warnings diff --git a/tests/models/efficientformer/test_modeling_tf_efficientformer.py b/tests/models/efficientformer/test_modeling_tf_efficientformer.py index fcd6958ed3..abb0878740 100644 --- a/tests/models/efficientformer/test_modeling_tf_efficientformer.py +++ b/tests/models/efficientformer/test_modeling_tf_efficientformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow EfficientFormer model. """ +"""Testing suite for the TensorFlow EfficientFormer model.""" import inspect import unittest diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py index dbca9b31a2..eaf0f4ec87 100644 --- a/tests/models/efficientnet/test_modeling_efficientnet.py +++ b/tests/models/efficientnet/test_modeling_efficientnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch EfficientNet model. 
""" - +"""Testing suite for the PyTorch EfficientNet model.""" import unittest diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py index 64611cb09c..f3648e9863 100644 --- a/tests/models/electra/test_tokenization_electra.py +++ b/tests/models/electra/test_tokenization_electra.py @@ -65,14 +65,14 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) @@ -83,7 +83,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -102,7 +102,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer(do_lower_case=True) rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -120,7 +120,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -128,7 +128,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -136,7 +136,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -144,7 +144,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -152,7 +152,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -201,7 +201,7 @@ class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index dd07f5a6e2..c2974943e0 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Encodec model. """ +"""Testing suite for the PyTorch Encodec model.""" import copy import inspect diff --git a/tests/models/ernie_m/test_modeling_ernie_m.py b/tests/models/ernie_m/test_modeling_ernie_m.py index e429a12e6e..17c9aa89f3 100644 --- a/tests/models/ernie_m/test_modeling_ernie_m.py +++ b/tests/models/ernie_m/test_modeling_ernie_m.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ErnieM model. """ - +"""Testing suite for the PyTorch ErnieM model.""" import unittest diff --git a/tests/models/ernie_m/test_tokenization_ernie_m.py b/tests/models/ernie_m/test_tokenization_ernie_m.py index 5cc5ec6991..01de7d3731 100644 --- a/tests/models/ernie_m/test_tokenization_ernie_m.py +++ b/tests/models/ernie_m/test_tokenization_ernie_m.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ErnieM model. """ +"""Testing suite for the PyTorch ErnieM model.""" import unittest diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index db3ccd6fd2..e8e72184b3 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ESM model. 
""" - +"""Testing suite for the PyTorch ESM model.""" import unittest diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index 1ec5ab8acb..11306e7368 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ESM model. """ - +"""Testing suite for the PyTorch ESM model.""" import unittest diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 59ab316140..50e8fcdbb4 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Falcon model. """ - +"""Testing suite for the PyTorch Falcon model.""" import tempfile import unittest diff --git a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py index 4cf104e693..cf57ee079a 100644 --- a/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py +++ b/tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch FastSpeech2Conformer model.""" +"""Testing suite for the PyTorch FastSpeech2Conformer model.""" import inspect import tempfile diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 2b628e1413..1287a297a4 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch FLAVA model. """ - +"""Testing suite for the PyTorch FLAVA model.""" import inspect import os diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 5941e2006c..8686c60ab6 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch FNet model. """ - +"""Testing suite for the PyTorch FNet model.""" import unittest from typing import Dict, List, Tuple diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py index fb2bb1c2c1..2cd29fc2c3 100644 --- a/tests/models/focalnet/test_modeling_focalnet.py +++ b/tests/models/focalnet/test_modeling_focalnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch FocalNet model. 
""" +"""Testing suite for the PyTorch FocalNet model.""" import collections import unittest diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py index 7628582e9f..9ddb3b325d 100644 --- a/tests/models/funnel/test_tokenization_funnel.py +++ b/tests/models/funnel/test_tokenization_funnel.py @@ -61,23 +61,23 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return FunnelTokenizerFast.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) def test_token_type_ids(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: - inputs = tokenizer("UNwant\u00E9d,running") + inputs = tokenizer("UNwant\u00e9d,running") sentence_len = len(inputs["input_ids"]) - 1 self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len) - inputs = tokenizer("UNwant\u00E9d,running", "UNwant\u00E9d,running") + inputs = tokenizer("UNwant\u00e9d,running", "UNwant\u00e9d,running") self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len + [1] * sentence_len) diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index 84c9128892..f65498af33 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Fuyu model. """ +"""Testing suite for the PyTorch Fuyu model.""" import io import unittest diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index fa0db9ee78..6aeb5f23c3 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Gemma model. """ +"""Testing suite for the PyTorch Gemma model.""" + import tempfile import unittest diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index b733164ec1..2e1486039e 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch GLPN model. 
""" - +"""Testing suite for the PyTorch GLPN model.""" import unittest diff --git a/tests/models/gpt2/test_tokenization_gpt2_tf.py b/tests/models/gpt2/test_tokenization_gpt2_tf.py index 0cea50db31..06f16c36e3 100644 --- a/tests/models/gpt2/test_tokenization_gpt2_tf.py +++ b/tests/models/gpt2/test_tokenization_gpt2_tf.py @@ -55,7 +55,7 @@ class GPTTokenizationTest(unittest.TestCase): self.test_sentences = [ "This is a straightforward English test sentence.", - "This one has some weird characters\rto\nsee\r\nif those\u00E9break things.", + "This one has some weird characters\rto\nsee\r\nif those\u00e9break things.", "Now we're going to add some Chinese: 一 二 三 一二三", "And some much more rare Chinese: 齉 堃 齉堃", "Je vais aussi écrire en français pour tester les accents", diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py index ce0aeadf16..5f4fd4d119 100644 --- a/tests/models/gpt_neo/test_modeling_gpt_neo.py +++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch GPT Neo model. """ - +"""Testing suite for the PyTorch GPT Neo model.""" import unittest diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 92d130b351..ed5bcac55e 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch GPTNeoX model. """ - +"""Testing suite for the PyTorch GPTNeoX model.""" import unittest diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py index 752b7a0b8d..52e9d5d5b1 100644 --- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py +++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch GPTNeoXJapanese model. """ - +"""Testing suite for the PyTorch GPTNeoXJapanese model.""" import unittest diff --git a/tests/models/graphormer/test_modeling_graphormer.py b/tests/models/graphormer/test_modeling_graphormer.py index ddb72543f5..55b1ccc34a 100644 --- a/tests/models/graphormer/test_modeling_graphormer.py +++ b/tests/models/graphormer/test_modeling_graphormer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Graphormer model. 
""" - +"""Testing suite for the PyTorch Graphormer model.""" import copy import inspect diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py index 1231baff7c..12f80260cb 100644 --- a/tests/models/grounding_dino/test_modeling_grounding_dino.py +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Grounding DINO model. """ +"""Testing suite for the PyTorch Grounding DINO model.""" import collections import inspect diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 5ec9bbbf1a..15e7ad29fc 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch GroupViT model. """ - +"""Testing suite for the PyTorch GroupViT model.""" import inspect import os diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py index be5ff803d9..3389735734 100644 --- a/tests/models/groupvit/test_modeling_tf_groupvit.py +++ b/tests/models/groupvit/test_modeling_tf_groupvit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow GroupViT model. """ - +"""Testing suite for the TensorFlow GroupViT model.""" from __future__ import annotations diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index d1a0558b4e..3e54e7d2db 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Hubert model. """ - +"""Testing suite for the PyTorch Hubert model.""" import math import os diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 5c3d45d2e8..91a9f66166 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Idefics model. """ +"""Testing suite for the PyTorch Idefics model.""" import unittest diff --git a/tests/models/idefics/test_modeling_tf_idefics.py b/tests/models/idefics/test_modeling_tf_idefics.py index eeb3faafa2..ffc4e586ba 100644 --- a/tests/models/idefics/test_modeling_tf_idefics.py +++ b/tests/models/idefics/test_modeling_tf_idefics.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Testing suite for the TF Idefics model. """ +"""Testing suite for the TF Idefics model.""" import os import tempfile diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index d932e68b3c..d81711a8ec 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Informer model. """ +"""Testing suite for the PyTorch Informer model.""" import inspect import tempfile diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 86aea876fa..e037b5bbe3 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch InstructBLIP model. """ - +"""Testing suite for the PyTorch InstructBLIP model.""" import inspect import tempfile diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 0fe515e516..13208d54f1 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ b/tests/models/jamba/test_modeling_jamba.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Jamba model. """ +"""Testing suite for the PyTorch Jamba model.""" + import math import tempfile import unittest diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index ca944d0df0..66f070ed46 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch KOSMOS-2 model. 
""" - +"""Testing suite for the PyTorch KOSMOS-2 model.""" import copy import inspect diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index a34811a90b..3ddd6e7660 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -58,14 +58,14 @@ class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): return LayoutLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index d70721e04c..284ba82e3f 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch LayoutLMv2 model. """ - +"""Testing suite for the PyTorch LayoutLMv2 model.""" import unittest diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index ce6bbd0f01..9f9a86a999 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -126,14 +126,14 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -141,7 +141,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -149,7 +149,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -157,7 +157,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -165,7 +165,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -218,7 +218,7 @@ class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index f280633c6a..6cb93d8b42 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch LayoutLMv3 model. """ +"""Testing suite for the PyTorch LayoutLMv3 model.""" import copy import unittest diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py index 6ae2e5090e..f473eb7580 100644 --- a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow LayoutLMv3 model. 
""" +"""Testing suite for the TensorFlow LayoutLMv3 model.""" from __future__ import annotations diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 8f1d353efd..bf8e6be498 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -103,7 +103,7 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.save_pretrained(self.tmpdirname) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text diff --git a/tests/models/led/test_modeling_led.py b/tests/models/led/test_modeling_led.py index 10d944c496..5b459bfd11 100644 --- a/tests/models/led/test_modeling_led.py +++ b/tests/models/led/test_modeling_led.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch LED model. """ - +"""Testing suite for the PyTorch LED model.""" import copy import tempfile diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py index 38c9c88594..6303e0365f 100644 --- a/tests/models/levit/test_modeling_levit.py +++ b/tests/models/levit/test_modeling_levit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch LeViT model. """ - +"""Testing suite for the PyTorch LeViT model.""" import unittest import warnings diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py index 1d1eda3380..7ed35f8361 100644 --- a/tests/models/longformer/test_tokenization_longformer.py +++ b/tests/models/longformer/test_tokenization_longformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the Longformer tokenizer. """ - +"""Testing suite for the Longformer tokenizer.""" import itertools import json diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index a35d5ec3dc..a9a5d4f938 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch LUKE model. 
""" +"""Testing suite for the PyTorch LUKE model.""" + import unittest from transformers import LukeConfig, is_torch_available diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index 716e3b971f..9500416c27 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -55,14 +55,14 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) diff --git a/tests/models/m2m_100/test_modeling_m2m_100.py b/tests/models/m2m_100/test_modeling_m2m_100.py index c280f698ba..412ece863f 100644 --- a/tests/models/m2m_100/test_modeling_m2m_100.py +++ b/tests/models/m2m_100/test_modeling_m2m_100.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch M2M100 model. """ - +"""Testing suite for the PyTorch M2M100 model.""" import copy import tempfile diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 3144dd48da..89a7358cca 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Marian model. """ +"""Testing suite for the PyTorch Marian model.""" import tempfile import unittest diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 370b1c5692..5d74f88fe5 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -106,7 +106,7 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): pass def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index d4167cfffe..100cbafa05 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Mask2Former model. 
""" +"""Testing suite for the PyTorch Mask2Former model.""" import unittest diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 6ba48517c3..7b2bec17f4 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MaskFormer model. """ +"""Testing suite for the PyTorch MaskFormer model.""" import copy import unittest diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index 8d29e8ebee..01e9ed0e67 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MaskFormer Swin model. """ +"""Testing suite for the PyTorch MaskFormer Swin model.""" import collections import unittest diff --git a/tests/models/mbart/test_modeling_mbart.py b/tests/models/mbart/test_modeling_mbart.py index 93294d6568..1eaa4c4cdb 100644 --- a/tests/models/mbart/test_modeling_mbart.py +++ b/tests/models/mbart/test_modeling_mbart.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MBART model. """ - +"""Testing suite for the PyTorch MBART model.""" import copy import tempfile diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index 818f65d80c..2ba1b92c21 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MegatronBERT model. """ - +"""Testing suite for the PyTorch MegatronBERT model.""" import math import os diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py index b2c3cb1400..ec1ac5a24a 100644 --- a/tests/models/mgp_str/test_modeling_mgp_str.py +++ b/tests/models/mgp_str/test_modeling_mgp_str.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MGP-STR model. """ +"""Testing suite for the PyTorch MGP-STR model.""" import unittest diff --git a/tests/models/mgp_str/test_processor_mgp_str.py b/tests/models/mgp_str/test_processor_mgp_str.py index 3fd7d83534..6a028a2842 100644 --- a/tests/models/mgp_str/test_processor_mgp_str.py +++ b/tests/models/mgp_str/test_processor_mgp_str.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Testing suite for the MgpstrProcessor. """ +"""Testing suite for the MgpstrProcessor.""" import json import os diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index b60062004a..02b203c7ae 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Mixtral model. """ - +"""Testing suite for the PyTorch Mixtral model.""" import tempfile import unittest diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 4d5e09e08d..0a01e2396a 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the MobileBERT tokenizer. """ - +"""Testing suite for the MobileBERT tokenizer.""" import os import unittest @@ -73,7 +72,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.get_input_output_texts def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text @@ -81,7 +80,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) @@ -93,7 +92,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -112,7 +111,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer(do_lower_case=True) rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -131,7 +130,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower def test_basic_tokenizer_lower(self): @@ -140,7 +139,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
"), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_false def test_basic_tokenizer_lower_strip_accents_false(self): @@ -149,7 +148,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_true def test_basic_tokenizer_lower_strip_accents_true(self): @@ -158,7 +157,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_default def test_basic_tokenizer_lower_strip_accents_default(self): @@ -167,7 +166,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_no_lower def test_basic_tokenizer_no_lower(self): @@ -222,7 +221,7 @@ class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py index 3be955a729..a1a8b8d9c1 100644 --- a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py +++ b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MobileNetV1 model. """ - +"""Testing suite for the PyTorch MobileNetV1 model.""" import unittest diff --git a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py index f8a1ce8d26..eb3105e427 100644 --- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MobileNetV2 model. 
""" - +"""Testing suite for the PyTorch MobileNetV2 model.""" import unittest diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py index 0fb94f38d6..381763d449 100644 --- a/tests/models/mobilevit/test_modeling_mobilevit.py +++ b/tests/models/mobilevit/test_modeling_mobilevit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MobileViT model. """ - +"""Testing suite for the PyTorch MobileViT model.""" import unittest diff --git a/tests/models/mobilevit/test_modeling_tf_mobilevit.py b/tests/models/mobilevit/test_modeling_tf_mobilevit.py index 3132b93649..fcad3be021 100644 --- a/tests/models/mobilevit/test_modeling_tf_mobilevit.py +++ b/tests/models/mobilevit/test_modeling_tf_mobilevit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow MobileViT model. """ - +"""Testing suite for the TensorFlow MobileViT model.""" from __future__ import annotations diff --git a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py index ff45a8c0b6..340bd22fbb 100644 --- a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py +++ b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MobileViTV2 model. """ - +"""Testing suite for the PyTorch MobileViTV2 model.""" import unittest diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py index b63dc7ab64..f1049f8ef5 100644 --- a/tests/models/mpnet/test_tokenization_mpnet.py +++ b/tests/models/mpnet/test_tokenization_mpnet.py @@ -57,14 +57,14 @@ class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index a0bf0ec65e..1df002a124 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MRA model. 
""" - +"""Testing suite for the PyTorch MRA model.""" import unittest diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 8482072d73..e21ccb71f0 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Musicgen model. """ +"""Testing suite for the PyTorch Musicgen model.""" + import copy import inspect import math diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index b32b508258..7587b61abf 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Musicgen Melody model. """ +"""Testing suite for the PyTorch Musicgen Melody model.""" + import copy import inspect import math diff --git a/tests/models/mvp/test_modeling_mvp.py b/tests/models/mvp/test_modeling_mvp.py index 225ea4a786..5637d62fd0 100644 --- a/tests/models/mvp/test_modeling_mvp.py +++ b/tests/models/mvp/test_modeling_mvp.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch MVP model. """ - +"""Testing suite for the PyTorch MVP model.""" import copy import tempfile diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py index 6a68311cc6..c04472620b 100644 --- a/tests/models/nat/test_modeling_nat.py +++ b/tests/models/nat/test_modeling_nat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Nat model. """ +"""Testing suite for the PyTorch Nat model.""" import collections import unittest diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py index 6997b896a5..714a1f17cc 100644 --- a/tests/models/nllb_moe/test_modeling_nllb_moe.py +++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch NLLB-MoE model. """ - +"""Testing suite for the PyTorch NLLB-MoE model.""" import copy import tempfile diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index f5bcb0ba5f..3d812ebf04 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Testing suite for the PyTorch Nystromformer model. """ - +"""Testing suite for the PyTorch Nystromformer model.""" import unittest diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index 906bd73a70..ee87521c5b 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch OLMo model. """ +"""Testing suite for the PyTorch OLMo model.""" import unittest diff --git a/tests/models/oneformer/test_modeling_oneformer.py b/tests/models/oneformer/test_modeling_oneformer.py index 538ab33cbf..9cdc475fae 100644 --- a/tests/models/oneformer/test_modeling_oneformer.py +++ b/tests/models/oneformer/test_modeling_oneformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch OneFormer model. """ +"""Testing suite for the PyTorch OneFormer model.""" import copy import inspect diff --git a/tests/models/opt/test_modeling_opt.py b/tests/models/opt/test_modeling_opt.py index 9845fcd469..e9843058f6 100644 --- a/tests/models/opt/test_modeling_opt.py +++ b/tests/models/opt/test_modeling_opt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch OPT model. """ - +"""Testing suite for the PyTorch OPT model.""" import copy import tempfile diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index d29f8c08c0..eb85a66a1b 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Owlv2 model. """ - +"""Testing suite for the PyTorch Owlv2 model.""" import inspect import os diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 370de65447..64fe046fb9 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch OwlViT model. """ - +"""Testing suite for the PyTorch OwlViT model.""" import inspect import os diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 0653e1057c..ed8d36debe 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PaliGemma model. 
""" +"""Testing suite for the PyTorch PaliGemma model.""" import gc import unittest diff --git a/tests/models/patchtsmixer/test_modeling_patchtsmixer.py b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py index 3551263d2c..64a9c1dbc5 100644 --- a/tests/models/patchtsmixer/test_modeling_patchtsmixer.py +++ b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PatchTSMixer model. """ +"""Testing suite for the PyTorch PatchTSMixer model.""" import inspect import itertools diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py index 78d85590a7..381800e845 100644 --- a/tests/models/patchtst/test_modeling_patchtst.py +++ b/tests/models/patchtst/test_modeling_patchtst.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PatchTST model. """ +"""Testing suite for the PyTorch PatchTST model.""" import inspect import random diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index dbc7c9de7b..85e37f2503 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PEGASUS model. """ +"""Testing suite for the PyTorch PEGASUS model.""" import tempfile import unittest diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py index 106a8b39e8..0fdcd0c075 100644 --- a/tests/models/pegasus_x/test_modeling_pegasus_x.py +++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PEGASUS-X model. """ - +"""Testing suite for the PyTorch PEGASUS-X model.""" import copy import math diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index 86bbc7b49d..530906388f 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Perceiver model. 
""" +"""Testing suite for the PyTorch Perceiver model.""" import copy import inspect diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index ff2b6e68dc..306f594d10 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -164,7 +164,7 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) tokenizer.save_pretrained(tmpdirname) @@ -180,7 +180,7 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 46a650c55a..518cb7e037 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Persimmon model. """ - +"""Testing suite for the PyTorch Persimmon model.""" import gc import unittest diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index e3c145bfa2..f395b70c1e 100644 --- a/tests/models/phi/test_modeling_phi.py +++ b/tests/models/phi/test_modeling_phi.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Phi model. """ - +"""Testing suite for the PyTorch Phi model.""" import unittest diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index cc0c00d4e1..ad9c4c46aa 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -13,8 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Phi-3 model. """ - +"""Testing suite for the PyTorch Phi-3 model.""" import unittest diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index c96ea624b6..980fd078ea 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Pix2Struct model. 
""" +"""Testing suite for the PyTorch Pix2Struct model.""" import copy import inspect diff --git a/tests/models/plbart/test_modeling_plbart.py b/tests/models/plbart/test_modeling_plbart.py index 998bd6c84f..cd21332ad4 100644 --- a/tests/models/plbart/test_modeling_plbart.py +++ b/tests/models/plbart/test_modeling_plbart.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PLBART model. """ - +"""Testing suite for the PyTorch PLBART model.""" import copy import tempfile diff --git a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py index ca5c3015a7..66216c6a7a 100644 --- a/tests/models/poolformer/test_modeling_poolformer.py +++ b/tests/models/poolformer/test_modeling_poolformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch PoolFormer model. """ - +"""Testing suite for the PyTorch PoolFormer model.""" import unittest diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 594f79b083..3a33b5a981 100644 --- a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Pop2Piano model. """ +"""Testing suite for the PyTorch Pop2Piano model.""" import copy import tempfile diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py index 09390db48b..5eede4d384 100644 --- a/tests/models/prophetnet/test_tokenization_prophetnet.py +++ b/tests/models/prophetnet/test_tokenization_prophetnet.py @@ -61,21 +61,21 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -83,7 +83,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
"), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -91,7 +91,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -99,7 +99,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -107,7 +107,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -170,7 +170,7 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py index d18a336a4a..feda0adeec 100644 --- a/tests/models/pvt/test_modeling_pvt.py +++ b/tests/models/pvt/test_modeling_pvt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Pvt model. """ - +"""Testing suite for the PyTorch Pvt model.""" import unittest diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py index ed7f5a3239..96e63834ec 100644 --- a/tests/models/qdqbert/test_modeling_qdqbert.py +++ b/tests/models/qdqbert/test_modeling_qdqbert.py @@ -13,8 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch QDQBERT model. """ - +"""Testing suite for the PyTorch QDQBERT model.""" import unittest diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 54718c4303..f2e66953a7 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Qwen2 model. """ - +"""Testing suite for the PyTorch Qwen2 model.""" import gc import tempfile diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index 48c6ccf78a..13af5b802b 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Qwen2MoE model. """ - +"""Testing suite for the PyTorch Qwen2MoE model.""" import gc import tempfile diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py index 4d6d9fd0ff..07a3b9d4b3 100644 --- a/tests/models/realm/test_modeling_realm.py +++ b/tests/models/realm/test_modeling_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch REALM model. """ +"""Testing suite for the PyTorch REALM model.""" import copy import unittest diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py index f963fb347f..85c478837e 100644 --- a/tests/models/realm/test_tokenization_realm.py +++ b/tests/models/realm/test_tokenization_realm.py @@ -65,14 +65,14 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text def test_full_tokenizer(self): tokenizer = self.tokenizer_class(self.vocab_file) - tokens = tokenizer.tokenize("UNwant\u00E9d,running") + tokens = tokenizer.tokenize("UNwant\u00e9d,running") self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) @@ -83,7 +83,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -102,7 +102,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer(do_lower_case=True) rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -120,7 +120,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -128,7 +128,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( 
tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -136,7 +136,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -144,7 +144,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -152,7 +152,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -201,7 +201,7 @@ class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 599161e8d4..b0475b244f 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RecurrentGemma model. """ +"""Testing suite for the PyTorch RecurrentGemma model.""" + import unittest from transformers import AutoModelForCausalLM, AutoTokenizer, RecurrentGemmaConfig, is_torch_available, set_seed diff --git a/tests/models/regnet/test_modeling_regnet.py b/tests/models/regnet/test_modeling_regnet.py index 8840a141fa..78935c4e57 100644 --- a/tests/models/regnet/test_modeling_regnet.py +++ b/tests/models/regnet/test_modeling_regnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RegNet model. 
""" - +"""Testing suite for the PyTorch RegNet model.""" import unittest diff --git a/tests/models/regnet/test_modeling_tf_regnet.py b/tests/models/regnet/test_modeling_tf_regnet.py index 70adc9c875..b2998ae070 100644 --- a/tests/models/regnet/test_modeling_tf_regnet.py +++ b/tests/models/regnet/test_modeling_tf_regnet.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow RegNet model. """ +"""Testing suite for the TensorFlow RegNet model.""" from __future__ import annotations diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index fe21ae2ecf..664888fcc0 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RemBERT model. """ - +"""Testing suite for the PyTorch RemBERT model.""" import unittest diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py index 096106a2fc..46794733f3 100644 --- a/tests/models/rembert/test_tokenization_rembert.py +++ b/tests/models/rembert/test_tokenization_rembert.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the RemBert tokenizer. """ - +"""Testing suite for the RemBert tokenizer.""" import tempfile import unittest diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py index 0fd32bc72f..5595ebde69 100644 --- a/tests/models/resnet/test_modeling_resnet.py +++ b/tests/models/resnet/test_modeling_resnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ResNet model. """ - +"""Testing suite for the PyTorch ResNet model.""" import unittest diff --git a/tests/models/resnet/test_modeling_tf_resnet.py b/tests/models/resnet/test_modeling_tf_resnet.py index a8e2ce93ee..f5762c7225 100644 --- a/tests/models/resnet/test_modeling_tf_resnet.py +++ b/tests/models/resnet/test_modeling_tf_resnet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the Tensorflow ResNet model. """ - +"""Testing suite for the Tensorflow ResNet model.""" from __future__ import annotations diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index d52304ade9..55dc8453f4 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RoCBert model. 
""" +"""Testing suite for the PyTorch RoCBert model.""" import unittest diff --git a/tests/models/roc_bert/test_tokenization_roc_bert.py b/tests/models/roc_bert/test_tokenization_roc_bert.py index 4fb2b172d6..fdd95a033a 100644 --- a/tests/models/roc_bert/test_tokenization_roc_bert.py +++ b/tests/models/roc_bert/test_tokenization_roc_bert.py @@ -73,7 +73,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = RoCBertBasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower with BasicTokenizer->RoCBertBasicTokenizer def test_basic_tokenizer_lower(self): @@ -82,7 +82,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_false with BasicTokenizer->RoCBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_false(self): @@ -91,7 +91,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_true with BasicTokenizer->RoCBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_true(self): @@ -100,7 +100,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_lower_strip_accents_default with BasicTokenizer->RoCBertBasicTokenizer def test_basic_tokenizer_lower_strip_accents_default(self): @@ -109,7 +109,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) # Copied from tests.models.bert.test_tokenization_bert.BertTokenizationTest.test_basic_tokenizer_no_lower with BasicTokenizer->RoCBertBasicTokenizer def test_basic_tokenizer_no_lower(self): @@ -164,7 +164,7 @@ class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index 64ce38c515..1c22243b37 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch RoFormer model. """ - +"""Testing suite for the PyTorch RoFormer model.""" import unittest diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index ee9eeefd33..1590d4f806 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SAM model. """ - +"""Testing suite for the PyTorch SAM model.""" import gc import unittest diff --git a/tests/models/sam/test_modeling_tf_sam.py b/tests/models/sam/test_modeling_tf_sam.py index d742a9b085..81fe6f1eaf 100644 --- a/tests/models/sam/test_modeling_tf_sam.py +++ b/tests/models/sam/test_modeling_tf_sam.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow SAM model. """ - +"""Testing suite for the TensorFlow SAM model.""" from __future__ import annotations diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index 925d734293..9445bf46fc 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SeamlessM4T model. """ - +"""Testing suite for the PyTorch SeamlessM4T model.""" import copy import tempfile diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index b36e29d792..f450dca519 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Testing suite for the PyTorch SeamlessM4Tv2 model. """ - +"""Testing suite for the PyTorch SeamlessM4Tv2 model.""" import copy import tempfile diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py index fb38338513..f0f125933b 100644 --- a/tests/models/segformer/test_modeling_segformer.py +++ b/tests/models/segformer/test_modeling_segformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SegFormer model. """ - +"""Testing suite for the PyTorch SegFormer model.""" import unittest diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py index 16b5740a08..89a28925c2 100644 --- a/tests/models/segformer/test_modeling_tf_segformer.py +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow SegFormer model. """ +"""Testing suite for the TensorFlow SegFormer model.""" from __future__ import annotations diff --git a/tests/models/seggpt/test_modeling_seggpt.py b/tests/models/seggpt/test_modeling_seggpt.py index efa0231c1e..46c370db77 100644 --- a/tests/models/seggpt/test_modeling_seggpt.py +++ b/tests/models/seggpt/test_modeling_seggpt.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SegGpt model. """ - +"""Testing suite for the PyTorch SegGpt model.""" import inspect import math diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 5342df9e08..2394ed015d 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Hubert model. """ - +"""Testing suite for the PyTorch Hubert model.""" import math import unittest diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 1980bd3ab1..a9af497226 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Hubert model. """ - +"""Testing suite for the PyTorch Hubert model.""" import math import unittest diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index e0a4825d0e..d65416a3fa 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SigLIP model. 
""" - +"""Testing suite for the PyTorch SigLIP model.""" import inspect import os diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 5d0e8f3a07..183120b820 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Speech2Text model. """ +"""Testing suite for the PyTorch Speech2Text model.""" import copy import inspect diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py index badde2485d..c2fd215f38 100644 --- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow Speech2Text model. """ +"""Testing suite for the TensorFlow Speech2Text model.""" from __future__ import annotations diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py index cbb449c6e7..fffa16aa30 100644 --- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py +++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Speech2Text model. """ +"""Testing suite for the PyTorch Speech2Text model.""" import unittest diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 622ae196bd..6badcfa060 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SpeechT5 model. """ +"""Testing suite for the PyTorch SpeechT5 model.""" import copy import inspect diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index b6a2588c87..b62571f189 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Splinter model. """ +"""Testing suite for the PyTorch Splinter model.""" import copy import unittest diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index 083f928612..b0d7261de6 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch StableLm model. """ - +"""Testing suite for the PyTorch StableLm model.""" import unittest diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index faba4d254b..7b05e5c165 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Starcoder2 model. """ - +"""Testing suite for the PyTorch Starcoder2 model.""" import tempfile import unittest diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py index c54e092809..ea90023b19 100644 --- a/tests/models/swiftformer/test_modeling_swiftformer.py +++ b/tests/models/swiftformer/test_modeling_swiftformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch SwiftFormer model. """ - +"""Testing suite for the PyTorch SwiftFormer model.""" import copy import unittest diff --git a/tests/models/swiftformer/test_modeling_tf_swiftformer.py b/tests/models/swiftformer/test_modeling_tf_swiftformer.py index e73d38605d..b05e390597 100644 --- a/tests/models/swiftformer/test_modeling_tf_swiftformer.py +++ b/tests/models/swiftformer/test_modeling_tf_swiftformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow SwiftFormer model. """ - +"""Testing suite for the TensorFlow SwiftFormer model.""" import inspect import unittest diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 699171722d..99917adafc 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Swin model. """ +"""Testing suite for the PyTorch Swin model.""" import collections import unittest diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py index f05ef7a434..6a443571d3 100644 --- a/tests/models/swin/test_modeling_tf_swin.py +++ b/tests/models/swin/test_modeling_tf_swin.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TF 2.0 Swin model. 
""" - +"""Testing suite for the TF 2.0 Swin model.""" from __future__ import annotations diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 44ca7b9249..2c09be2760 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Swin2SR model. """ +"""Testing suite for the PyTorch Swin2SR model.""" + import unittest from transformers import Swin2SRConfig diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index 7a948d1282..c0ab1081f4 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Swinv2 model. """ +"""Testing suite for the PyTorch Swinv2 model.""" + import collections import inspect import unittest diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py index 989517eb8c..e41b53a21f 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Table Transformer model. 
""" - +"""Testing suite for the PyTorch Table Transformer model.""" import inspect import math diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 8f2bf9bb69..b64eec06d0 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -143,7 +143,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text @@ -189,7 +189,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -208,7 +208,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer = self.get_tokenizer(do_lower_case=True) rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) - sequence = "UNwant\u00E9d,running" + sequence = "UNwant\u00e9d,running" tokens = tokenizer.tokenize(sequence) rust_tokens = rust_tokenizer.tokenize(sequence) @@ -230,7 +230,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def test_chinese(self): tokenizer = BasicTokenizer() - self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"]) def test_basic_tokenizer_lower(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -238,7 +238,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_false(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) @@ -246,7 +246,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"]) def test_basic_tokenizer_lower_strip_accents_true(self): tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) @@ -254,7 +254,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_lower_strip_accents_default(self): tokenizer = BasicTokenizer(do_lower_case=True) @@ -262,7 +262,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual( tokenizer.tokenize(" \tHäLLo!how \n Are yoU? 
"), ["hallo", "!", "how", "are", "you", "?"] ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"]) def test_basic_tokenizer_no_lower(self): tokenizer = BasicTokenizer(do_lower_case=False) @@ -311,7 +311,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertTrue(_is_whitespace("\t")) self.assertTrue(_is_whitespace("\r")) self.assertTrue(_is_whitespace("\n")) - self.assertTrue(_is_whitespace("\u00A0")) + self.assertTrue(_is_whitespace("\u00a0")) self.assertFalse(_is_whitespace("A")) self.assertFalse(_is_whitespace("-")) @@ -935,7 +935,7 @@ class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): table = self.get_table(tokenizer, length=0) tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(table, sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 330cf95d06..a2694fd0b3 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TimeSeriesTransformer model. """ +"""Testing suite for the PyTorch TimeSeriesTransformer model.""" import inspect import tempfile diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 3d97d2c0f6..e3c6d7fad3 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TimeSformer model. """ - +"""Testing suite for the PyTorch TimeSformer model.""" import copy import unittest diff --git a/tests/models/trocr/test_modeling_trocr.py b/tests/models/trocr/test_modeling_trocr.py index da24c7dd43..854ca22567 100644 --- a/tests/models/trocr/test_modeling_trocr.py +++ b/tests/models/trocr/test_modeling_trocr.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TrOCR model. """ +"""Testing suite for the PyTorch TrOCR model.""" import unittest diff --git a/tests/models/tvlt/test_feature_extraction_tvlt.py b/tests/models/tvlt/test_feature_extraction_tvlt.py index cd737d5a8f..a0b3f7a916 100644 --- a/tests/models/tvlt/test_feature_extraction_tvlt.py +++ b/tests/models/tvlt/test_feature_extraction_tvlt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TVLT feature extraction. 
""" +"""Testing suite for the TVLT feature extraction.""" import itertools import random diff --git a/tests/models/tvlt/test_image_processor_tvlt.py b/tests/models/tvlt/test_image_processor_tvlt.py index 8677a86665..c2974da6d8 100644 --- a/tests/models/tvlt/test_image_processor_tvlt.py +++ b/tests/models/tvlt/test_image_processor_tvlt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TVLT image processor. """ +"""Testing suite for the TVLT image processor.""" import unittest diff --git a/tests/models/tvlt/test_modeling_tvlt.py b/tests/models/tvlt/test_modeling_tvlt.py index ce27946ee7..574559a7a2 100644 --- a/tests/models/tvlt/test_modeling_tvlt.py +++ b/tests/models/tvlt/test_modeling_tvlt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TVLT model. """ +"""Testing suite for the PyTorch TVLT model.""" import copy import inspect diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py index c7bcc148a1..90050c3bdf 100644 --- a/tests/models/tvp/test_modeling_tvp.py +++ b/tests/models/tvp/test_modeling_tvp.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch TVP model. """ - +"""Testing suite for the PyTorch TVP model.""" import unittest diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index d022128ed1..8bea9880b0 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -102,7 +102,7 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer.save_pretrained(self.tmpdirname) def get_input_output_texts(self, tokenizer): - input_text = "UNwant\u00E9d,running" + input_text = "UNwant\u00e9d,running" output_text = "unwanted, running" return input_text, output_text diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index a286274828..7735fe4ffa 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch UniSpeech model. """ +"""Testing suite for the PyTorch UniSpeech model.""" import math import unittest diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index 3e71449357..a48ed36888 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch UniSpeechSat model. 
""" +"""Testing suite for the PyTorch UniSpeechSat model.""" import math import unittest diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index 234cd8af09..79fda279fa 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch UperNet framework. """ - +"""Testing suite for the PyTorch UperNet framework.""" import unittest diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 52a75e8a05..1a91a2660f 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VideoLlava model. """ +"""Testing suite for the PyTorch VideoLlava model.""" import gc import unittest diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 425fe2bcd7..cef2ac33ae 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VideoMAE model. """ - +"""Testing suite for the PyTorch VideoMAE model.""" import copy import unittest diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 3e25fc3bba..194b4dd7ba 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViLT model. """ +"""Testing suite for the PyTorch ViLT model.""" import unittest diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index ff84f71784..af278643fd 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VipLlava model. """ +"""Testing suite for the PyTorch VipLlava model.""" import gc import unittest diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py index 171f33d680..f2bd9c1243 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow VisionEncoderDecoder model. """ - +"""Testing suite for the TensorFlow VisionEncoderDecoder model.""" from __future__ import annotations diff --git a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py index ddf8b4335f..e57a4bd4db 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VisionTextDualEncoder model. """ - +"""Testing suite for the PyTorch VisionTextDualEncoder model.""" import collections import tempfile diff --git a/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py index 10baef0059..eeb8951edb 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_tf_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VisionTextDualEncoder model. """ - +"""Testing suite for the PyTorch VisionTextDualEncoder model.""" from __future__ import annotations diff --git a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py index 4a1ee2462e..88e06178e5 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VisionTextDualEncoder model. """ - +"""Testing suite for the PyTorch VisionTextDualEncoder model.""" import collections import tempfile diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index 249ccdd84b..d24ea14b65 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VisualBERT model. """ +"""Testing suite for the PyTorch VisualBERT model.""" import copy import unittest diff --git a/tests/models/vit/test_modeling_tf_vit.py b/tests/models/vit/test_modeling_tf_vit.py index 2c06b0bc60..a13119436f 100644 --- a/tests/models/vit/test_modeling_tf_vit.py +++ b/tests/models/vit/test_modeling_tf_vit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow ViT model. """ - +"""Testing suite for the TensorFlow ViT model.""" from __future__ import annotations diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py index e150bd0be9..2a7886c5a6 100644 --- a/tests/models/vit/test_modeling_vit.py +++ b/tests/models/vit/test_modeling_vit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViT model. """ - +"""Testing suite for the PyTorch ViT model.""" import unittest diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py index b3d0040c22..043dcb4de4 100644 --- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py +++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViT Hybrid model. """ - +"""Testing suite for the PyTorch ViT Hybrid model.""" import unittest diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py index 4221d6bfd3..5c27e5ac80 100644 --- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow ViTMAE model. """ - +"""Testing suite for the TensorFlow ViTMAE model.""" from __future__ import annotations diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 6c981adeb8..0357a3ebda 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViTMAE model. """ - +"""Testing suite for the PyTorch ViTMAE model.""" import math import tempfile diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py index be0857181e..f911749c82 100644 --- a/tests/models/vit_msn/test_modeling_vit_msn.py +++ b/tests/models/vit_msn/test_modeling_vit_msn.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViTMSN model. """ - +"""Testing suite for the PyTorch ViTMSN model.""" import unittest diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py index 2df1b79257..49076afb4d 100644 --- a/tests/models/vitdet/test_modeling_vitdet.py +++ b/tests/models/vitdet/test_modeling_vitdet.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" Testing suite for the PyTorch ViTDet model. """ - +"""Testing suite for the PyTorch ViTDet model.""" import unittest diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py index 4a8e85160b..ccdefe957c 100644 --- a/tests/models/vitmatte/test_modeling_vitmatte.py +++ b/tests/models/vitmatte/test_modeling_vitmatte.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VitMatte model. """ - +"""Testing suite for the PyTorch VitMatte model.""" import unittest diff --git a/tests/models/vits/test_modeling_vits.py b/tests/models/vits/test_modeling_vits.py index b83165aff4..791071e096 100644 --- a/tests/models/vits/test_modeling_vits.py +++ b/tests/models/vits/test_modeling_vits.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch VITS model. """ +"""Testing suite for the PyTorch VITS model.""" import copy import os diff --git a/tests/models/vits/test_tokenization_vits.py b/tests/models/vits/test_tokenization_vits.py index fee6bac3a4..a71c5a05c4 100644 --- a/tests/models/vits/test_tokenization_vits.py +++ b/tests/models/vits/test_tokenization_vits.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for the VITS tokenizer.""" + import json import os import shutil @@ -87,7 +88,7 @@ class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase): # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) diff --git a/tests/models/vivit/test_modeling_vivit.py b/tests/models/vivit/test_modeling_vivit.py index cbb45731ae..dd13d0306a 100644 --- a/tests/models/vivit/test_modeling_vivit.py +++ b/tests/models/vivit/test_modeling_vivit.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch ViViT model. """ - +"""Testing suite for the PyTorch ViViT model.""" import copy import inspect diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 9d86fb245c..eed419910d 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Wav2Vec2 model. 
""" +"""Testing suite for the PyTorch Wav2Vec2 model.""" import gc import math diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index a4a0a95972..8fae3a4bad 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Wav2Vec2-BERT model. """ +"""Testing suite for the PyTorch Wav2Vec2-BERT model.""" + import tempfile import unittest diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index 5c7bfd0a9b..020f61c82f 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Wav2Vec2-Conformer model. """ +"""Testing suite for the PyTorch Wav2Vec2-Conformer model.""" + import math import tempfile import unittest diff --git a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py index ea81c88ede..d3207cee1d 100644 --- a/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py +++ b/tests/models/wav2vec2_phoneme/test_tokenization_wav2vec2_phoneme.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tests for the Wav2Vec2Phoneme tokenizer.""" + import json import os import unittest diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 3cf4348f6c..2e8754d19f 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch WavLM model. """ +"""Testing suite for the PyTorch WavLM model.""" import math import unittest diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 9d1a329934..e2df1b1397 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the TensorFlow Whisper model. """ +"""Testing suite for the TensorFlow Whisper model.""" from __future__ import annotations diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 58acb5f2fd..70e37d7649 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Whisper model. """ +"""Testing suite for the PyTorch Whisper model.""" import copy import inspect diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index fc5c1679a6..3f7628a945 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch XCLIP model. """ - +"""Testing suite for the PyTorch XCLIP model.""" import inspect import os diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index bd65e6c80b..32dd4685c8 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -154,7 +154,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ".", ], ) - self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"]) + self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["▁he", "ll", "o"]) def test_tokenizer_no_lower(self): tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py index 9c145388ff..f8edfc7ff8 100644 --- a/tests/models/yolos/test_modeling_yolos.py +++ b/tests/models/yolos/test_modeling_yolos.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch YOLOS model. """ - +"""Testing suite for the PyTorch YOLOS model.""" import unittest diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index ca41b074bc..f7fa4ea31b 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch YOSO model. """ - +"""Testing suite for the PyTorch YOSO model.""" import unittest diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py index fd8b36fc9a..126659f0bc 100644 --- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py +++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Finetuning the library models for sequence classification on GLUE.""" +"""Finetuning the library models for sequence classification on GLUE.""" # You can also adapt this script on your own text classification task. Pointers for this are left as comments. 
import logging diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 76402cd092..56814b74f5 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -709,7 +709,7 @@ class TokenizerTesterMixin: # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" before_tokens = tokenizer.encode(sample_text, add_special_tokens=False) before_vocab = tokenizer.get_vocab() tokenizer.save_pretrained(tmpdirname) @@ -728,7 +728,7 @@ class TokenizerTesterMixin: # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") @@ -764,7 +764,7 @@ class TokenizerTesterMixin: # Isolate this from the other tests because we save additional tokens/etc tmpdirname = tempfile.mkdtemp() - sample_text = " He is very happy, UNwant\u00E9d,running" + sample_text = " He is very happy, UNwant\u00e9d,running" tokenizer.add_tokens(["bim", "bambam"]) additional_special_tokens = tokenizer.additional_special_tokens additional_special_tokens.append("new_additional_special_token") diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index e5838dd4a3..7ff6b29629 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -15,6 +15,7 @@ """ isort:skip_file """ + import os import pickle import tempfile diff --git a/utils/add_pipeline_model_mapping_to_test.py b/utils/add_pipeline_model_mapping_to_test.py index ebefcff9af..e67f65f824 100644 --- a/utils/add_pipeline_model_mapping_to_test.py +++ b/utils/add_pipeline_model_mapping_to_test.py @@ -24,7 +24,6 @@ This script will be (mostly) used in the following 2 situations: This script is **NOT** intended to be run (manually) by community contributors. 
""" - import argparse import glob import inspect diff --git a/utils/check_copies.py b/utils/check_copies.py index dd5d5c77da..c4fa2fbaa0 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -753,9 +753,9 @@ def is_copy_consistent(filename: str, overwrite: bool = False, buffer: dict = No else: # not in the target --> add it theoretical_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code - name_mappings_1[ + name_mappings_1[f"_ignored_new_block_{ignored_new_block_index}"] = ( f"_ignored_new_block_{ignored_new_block_index}" - ] = f"_ignored_new_block_{ignored_new_block_index}" + ) del observed_code_blocks[name] observed_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code diff --git a/utils/check_doc_toc.py b/utils/check_doc_toc.py index ccbff5e0b6..f90ffc2cc9 100644 --- a/utils/check_doc_toc.py +++ b/utils/check_doc_toc.py @@ -31,7 +31,6 @@ python utils/check_doc_toc.py --fix_and_overwrite ``` """ - import argparse from collections import defaultdict from typing import List diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 04572d132b..ed57cd308e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -32,6 +32,7 @@ python utils/check_docstrings.py --fix_and_overwrite which is used by `make fix-copies` (note that this fills what it cans, you might have to manually fill information like argument descriptions). """ + import argparse import ast import enum diff --git a/utils/check_doctest_list.py b/utils/check_doctest_list.py index f39895ff52..1d763475cb 100644 --- a/utils/check_doctest_list.py +++ b/utils/check_doctest_list.py @@ -30,6 +30,7 @@ Auto-sort the doctest list if it is not properly sorted (used in `make fix-copie python utils/check_doctest_list.py --fix_and_overwrite ``` """ + import argparse import os diff --git a/utils/check_dummies.py b/utils/check_dummies.py index a3ab6ebfa7..e66d69ada1 100644 --- a/utils/check_dummies.py +++ b/utils/check_dummies.py @@ -33,6 +33,7 @@ Update the dummy files if needed (used in `make fix-copies`): python utils/check_dummies.py --fix_and_overwrite ``` """ + import argparse import os import re diff --git a/utils/check_repo.py b/utils/check_repo.py index 13dcd6ad97..dcb1374d8e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -30,6 +30,7 @@ python utils/check_repo.py It has no auto-fix mode. """ + import inspect import os import re diff --git a/utils/check_support_list.py b/utils/check_support_list.py index 3cb0b61602..89e1bcf9d6 100644 --- a/utils/check_support_list.py +++ b/utils/check_support_list.py @@ -25,6 +25,7 @@ python utils/check_support_list.py It has no auto-fix mode. """ + import os from glob import glob diff --git a/utils/check_table.py b/utils/check_table.py index 9c9318ca85..0866f6bf61 100644 --- a/utils/check_table.py +++ b/utils/check_table.py @@ -31,6 +31,7 @@ python utils/check_inits.py --fix_and_overwrite which is used by `make fix-copies`. 
""" + import argparse import collections import os diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py index cdf2aaffdf..7adf804eaf 100644 --- a/utils/custom_init_isort.py +++ b/utils/custom_init_isort.py @@ -34,6 +34,7 @@ For a check only (as used in `make quality`) run: python utils/custom_init_isort.py --check_only ``` """ + import argparse import os import re diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index ef482d47de..22e9fcae47 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -1,4 +1,4 @@ -""" Script for downloading all GLUE data. +"""Script for downloading all GLUE data. Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e Note: for legal reasons, we are unable to host MRPC. diff --git a/utils/models_to_deprecate.py b/utils/models_to_deprecate.py index ebdecf22eb..17ea1fd28e 100644 --- a/utils/models_to_deprecate.py +++ b/utils/models_to_deprecate.py @@ -14,6 +14,7 @@ """ Script to find a candidate list of models to deprecate based on the number of downloads and the date of the last commit. """ + import argparse import glob import json diff --git a/utils/release.py b/utils/release.py index dbddba2ac5..b0349a80b4 100644 --- a/utils/release.py +++ b/utils/release.py @@ -41,6 +41,7 @@ python release.py --post_release or use `make post-release`. """ + import argparse import os import re diff --git a/utils/sort_auto_mappings.py b/utils/sort_auto_mappings.py index 8871b6807d..42382b6dfe 100644 --- a/utils/sort_auto_mappings.py +++ b/utils/sort_auto_mappings.py @@ -29,6 +29,7 @@ To only check if the mappings are properly sorted (as used in `make quality`), d python utils/sort_auto_mappings.py --check_only ``` """ + import argparse import os import re diff --git a/utils/update_metadata.py b/utils/update_metadata.py index 2296d2998f..1806eb3f03 100755 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -28,6 +28,7 @@ that new pipelines are properly added as metadata (as used in `make repo-consist python utils/update_metadata.py --check-only ``` """ + import argparse import collections import os diff --git a/utils/update_tiny_models.py b/utils/update_tiny_models.py index 2e40990691..2cd8f72d56 100644 --- a/utils/update_tiny_models.py +++ b/utils/update_tiny_models.py @@ -20,7 +20,6 @@ version of `tests/utils/tiny_model_summary.json`. That updated file should be me `transformers` so the pipeline testing will use the latest created/updated tiny models. """ - import argparse import copy import json From 8e8786e5f04d7bba787ad7876630c72e20092016 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 22 May 2024 10:52:59 +0200 Subject: [PATCH 29/80] Update build ci image [push-ci-image] (#30933) * [build-ci-image] * correct branch * push ci image * [build-ci-image] * update scheduled as well * [push-ci-image] * [build-ci-image] * [push-ci-image] * update deps * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * oups [build-ci-image] * [push-ci-image] * fix * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * updated * [build-ci-image] update tag * [build-ci-image] * [build-ci-image] * fix tag * [build-ci-image] * [build-ci-image] * [build-ci-image] * [build-ci-image] * github name * commit_title? 
* fetch * update * it not found * dev * dev * [push-ci-image] * dev * dev * update * dev * dev print dev commit message dev * dev ? dev * dev * dev * dev * dev * [build-ci-image] * [build-ci-image] * [push-ci-image] * revert unwanted * revert convert as well * no you are not important * [build-ci-image] * Update .circleci/config.yml * pin tf probability dev --- .circleci/config.yml | 3 ++- .circleci/create_circleci_config.py | 6 ++++++ .github/workflows/build-ci-docker-images.yml | 18 ++++++++++++++---- docker/consistency.dockerfile | 3 ++- docker/jax-light.dockerfile | 3 ++- docker/pipeline-tf.dockerfile | 3 ++- docker/pipeline-torch.dockerfile | 3 ++- docker/quality.dockerfile | 3 ++- docker/tf-light.dockerfile | 3 ++- docker/torch-jax-light.dockerfile | 3 ++- docker/torch-light.dockerfile | 3 ++- setup.py | 6 +++--- src/transformers/dependency_versions_table.py | 4 ++-- 13 files changed, 43 insertions(+), 18 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index ddb8869a6a..9d04cfb941 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,6 +31,7 @@ jobs: steps: - checkout - run: uv pip install -U -e . + - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV" - run: mkdir -p test_preparation - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt - store_artifacts: @@ -80,7 +81,7 @@ jobs: path: ~/transformers/test_preparation/filtered_test_list.txt - store_artifacts: path: test_preparation/examples_test_list.txt - - run: python .circleci/create_circleci_config.py --fetcher_folder test_preparation + - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation - run: | if [ ! -s test_preparation/generated_config.yml ]; then echo "No tests to run, exiting early!" diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 8b6a773ee4..1427b08a96 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -72,6 +72,12 @@ class CircleCIJob: if self.docker_image is None: # Let's avoid changing the default list and make a copy. 
self.docker_image = copy.deepcopy(DEFAULT_DOCKER_IMAGE) + else: + # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED + print(os.environ.get("GIT_COMMIT_MESSAGE")) + if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or "dev" in os.environ.get("GIT_COMMIT_MESSAGE", ""): + self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev" + print(f"Using {self.docker_image} docker image") if self.install_steps is None: self.install_steps = [] if self.pytest_options is None: diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml index f7b75a3a30..6d6df220e4 100644 --- a/.github/workflows/build-ci-docker-images.yml +++ b/.github/workflows/build-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build pr ci-docker on: push: branches: - - change-ci # for now let's only build on this branch + - push-ci-image # for now let's only build on this branch repository_dispatch: workflow_call: inputs: @@ -22,7 +22,7 @@ jobs: build: runs-on: ubuntu-22.04 - if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' }} + if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }} strategy: matrix: @@ -30,6 +30,16 @@ jobs: continue-on-error: true steps: + - + name: Set tag + run: | + if [ ${{contains(github.event.head_commit.message, '[build-ci-image]')}} ]; then + echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV" + echo "setting it to DEV!" + else + echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV" + + fi - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -50,5 +60,5 @@ jobs: build-args: | REF=${{ github.sha }} file: "./docker/${{ matrix.file }}.dockerfile" - push: true - tags: huggingface/transformers-${{ matrix.file }} \ No newline at end of file + push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }} + tags: ${{ env.TAG }} \ No newline at end of file diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile index fa94259e2d..6bdcefc458 100644 --- a/docker/consistency.dockerfile +++ b/docker/consistency.dockerfile @@ -1,12 +1,13 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 USER root +ARG REF=main RUN apt-get update && apt-get install -y time git pkg-config make git-lfs ENV VIRTUAL_ENV=/usr/local RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir tensorflow-cpu tf-keras -RUN uv pip install --no-cache-dir "transformers[flax,quality,vision,testing]" +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]" RUN git lfs install RUN pip uninstall -y transformers diff --git a/docker/jax-light.dockerfile b/docker/jax-light.dockerfile index 8383330628..d9ca5948f9 100644 --- a/docker/jax-light.dockerfile +++ b/docker/jax-light.dockerfile @@ -1,9 +1,10 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake ENV VIRTUAL_ENV=/usr/local RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir "scipy<1.13" 
"transformers[flax,testing,sentencepiece,flax-speech,vision]" +RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" RUN pip uninstall -y transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/pipeline-tf.dockerfile b/docker/pipeline-tf.dockerfile index 81af0ea9c1..3c83a254c6 100644 --- a/docker/pipeline-tf.dockerfile +++ b/docker/pipeline-tf.dockerfile @@ -1,9 +1,10 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++ ENV VIRTUAL_ENV=/usr/local RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir "transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" +RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile index 554d978314..a496777c2c 100644 --- a/docker/pipeline-torch.dockerfile +++ b/docker/pipeline-torch.dockerfile @@ -1,10 +1,11 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ENV VIRTUAL_ENV=/usr/local RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docker/quality.dockerfile b/docker/quality.dockerfile index 471af0526b..631467a599 100644 --- a/docker/quality.dockerfile +++ b/docker/quality.dockerfile @@ -1,8 +1,9 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y time git ENV VIRTUAL_ENV=/usr/local RUN pip install uv && uv venv -RUN uv pip install --no-cache-dir -U pip setuptools GitPython transformers "ruff==0.1.5" urllib3 +RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3 RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/tf-light.dockerfile b/docker/tf-light.dockerfile index 23dcd40db2..894f55870d 100644 --- a/docker/tf-light.dockerfile +++ b/docker/tf-light.dockerfile @@ -1,11 +1,12 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git RUN apt-get install -y cmake ENV VIRTUAL_ENV=/usr/local RUN pip 
--no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" RUN pip uninstall -y transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/torch-jax-light.dockerfile b/docker/torch-jax-light.dockerfile index ef28563ec1..cfdbef788b 100644 --- a/docker/torch-jax-light.dockerfile +++ b/docker/torch-jax-light.dockerfile @@ -1,12 +1,13 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ENV VIRTUAL_ENV=/usr/local RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN uv pip install --no-deps accelerate RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu -RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax, audio, sklearn,sentencepiece,vision,testing]" +RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" # RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index 2172d66ca8..2c169d5e6d 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -1,10 +1,11 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ENV VIRTUAL_ENV=/usr/local RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" RUN pip uninstall -y transformers \ No newline at end of file diff --git a/setup.py b/setup.py index 9ad7c84b58..3d6c78fd9a 100644 --- a/setup.py +++ b/setup.py @@ -173,10 +173,10 @@ _deps = [ "tensorflow-cpu>2.9,<2.16", "tensorflow>2.9,<2.16", "tensorflow-text<2.16", - "tensorflow-probability<2.16", + "tensorflow-probability<0.24", "tf2onnx", "timeout-decorator", - "timm", + "timm<=0.9.16", "tokenizers>=0.19,<0.20", "torch", "torchaudio", @@ -339,7 +339,7 @@ extras["testing"] = ( ) extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] - +extras["ruff"] = deps_list("ruff") extras["quality"] = deps_list("datasets", "isort", "ruff", "GitPython", "urllib3") extras["all"] = ( diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 
2e48650b9c..29c916aff6 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -78,10 +78,10 @@ deps = { "tensorflow-cpu": "tensorflow-cpu>2.9,<2.16", "tensorflow": "tensorflow>2.9,<2.16", "tensorflow-text": "tensorflow-text<2.16", - "tensorflow-probability": "tensorflow-probability<2.16", + "tensorflow-probability": "tensorflow-probability<0.24", "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", - "timm": "timm", + "timm": "timm<=0.9.16", "tokenizers": "tokenizers>=0.19,<0.20", "torch": "torch", "torchaudio": "torchaudio", From edb14eba641bbbfc4170b4341475ed0e80748e49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 May 2024 11:27:41 +0100 Subject: [PATCH 30/80] Bump requests from 2.31.0 to 2.32.2 in /examples/research_projects/lxmert (#30956) --- updated-dependencies: - dependency-name: requests dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- examples/research_projects/lxmert/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/lxmert/requirements.txt b/examples/research_projects/lxmert/requirements.txt index 60a1858b4e..1a3c80e086 100644 --- a/examples/research_projects/lxmert/requirements.txt +++ b/examples/research_projects/lxmert/requirements.txt @@ -75,7 +75,7 @@ pyzmq==19.0.2 qtconsole==4.7.7 QtPy==1.9.0 regex==2020.7.14 -requests==2.31.0 +requests==2.32.2 retrying==1.3.3 sacremoses==0.0.43 Send2Trash==1.5.0 From 934e1b84e956ea74323b92eb86300126b8139a42 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 22 May 2024 16:56:41 +0500 Subject: [PATCH 31/80] Update video-llava docs (#30935) * update video-llava * Update docs/source/en/model_doc/video_llava.md Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/video_llava.md | 96 ++++++++++++++++--- .../video_llava/modeling_video_llava.py | 14 +-- 2 files changed, 91 insertions(+), 19 deletions(-) diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 7f27e65a4e..307c55bb2c 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -42,21 +42,28 @@ a unified visual representation, outperforming models designed specifically for work to provide modest insights into the multi-modal inputs for the LLM* -Tips: +## Usage tips: - We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. Simply make sure to call processor.tokenizer.padding_side = "left" before generating. - Note the model has not been explicitly trained to process multiple images/videos in the same prompt, although this is technically possible, you may experience inaccurate results. -- For better results, we recommend users prompt the model with the correct prompt format: +- Note that the video inputs should have exactly 8 frames at the input, since the models were trained in that setting. +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). + + +## Usage example + +### Single Media Mode + +The model can accept both images and videos as input. 
Here's example code for inference in half-precision (`torch.float16`): ```python import av import torch import numpy as np -import requests -from PIL import Image from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor def read_video_pyav(container, indices): @@ -79,36 +86,99 @@ def read_video_pyav(container, indices): frames.append(frame) return np.stack([x.to_ndarray(format="rgb24") for x in frames]) - -model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", device_map="auto") +# Load the model in half-precision +model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, device_map="auto") processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf") +# Load the video as an np.array, uniformly sampling 8 frames video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset") - container = av.open(video_path) total_frames = container.streams.video[0].frames indices = np.arange(0, total_frames, total_frames / 8).astype(int) video = read_video_pyav(container, indices) +# For better results, we recommend prompting the model in the following format prompt = "USER: 