Merge branch 'main' of github.com:huggingface/transformers into diff-converter
commit fa8a86ccd2
@@ -98,7 +98,7 @@ jobs:
   fetch_all_tests:
     working_directory: ~/transformers
     docker:
-      - image: huggingface/transformers-consistency
+      - image: huggingface/transformers-quality
     parallelism: 1
     steps:
       - checkout
@@ -17,50 +17,50 @@ body:
     description: |
       Your issue will be replied to more quickly if you can figure out the right person to tag with @
       If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.

       All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and
       a core maintainer will ping the right person.

       Please tag fewer than 3 people.

       Models:

       - text models: @ArthurZucker and @younesbelkada
       - vision models: @amyeroberts
       - speech models: @sanchit-gandhi
       - graph models: @clefourrier

       Library:

       - flax: @sanchit-gandhi
-      - generate: @gante
+      - generate: @zucchini-nlp (visual-language models) or @gante (all others)
       - pipelines: @Narsil
       - tensorflow: @gante and @Rocketknight1
       - tokenizers: @ArthurZucker
-      - trainer: @muellerzr and @pacman100
+      - trainer: @muellerzr @SunMarc

       Integrations:

-      - deepspeed: HF Trainer/Accelerate: @pacman100
+      - deepspeed: HF Trainer/Accelerate: @muellerzr
       - ray/raytune: @richardliaw, @amogkam
       - Big Model Inference: @SunMarc
       - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada

       Documentation: @stevhliu

       Model hub:

       - for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.

       HF projects:

       - accelerate: [different repo](https://github.com/huggingface/accelerate)
       - datasets: [different repo](https://github.com/huggingface/datasets)
       - diffusers: [different repo](https://github.com/huggingface/diffusers)
       - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

       Maintained examples (not research project or legacy):

       - Flax: @sanchit-gandhi
       - PyTorch: See Models above and tag the person corresponding to the modality of the example.
       - TensorFlow: @Rocketknight1
@@ -101,11 +101,11 @@ body:

     placeholder: |
       Steps to reproduce the behavior:

         1.
         2.
         3.


   - type: textarea
     id: expected-behavior
@@ -47,15 +47,15 @@ Models:

 Library:

 - flax: @sanchit-gandhi
-- generate: @gante
+- generate: @zucchini-nlp (visual-language models) or @gante (all others)
 - pipelines: @Narsil
 - tensorflow: @gante and @Rocketknight1
 - tokenizers: @ArthurZucker
-- trainer: @muellerzr and @pacman100
+- trainer: @muellerzr and @SunMarc

 Integrations:

-- deepspeed: HF Trainer/Accelerate: @pacman100
+- deepspeed: HF Trainer/Accelerate: @muellerzr
 - ray/raytune: @richardliaw, @amogkam
 - Big Model Inference: @SunMarc
 - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada
@@ -70,16 +70,6 @@ jobs:
     name: "Latest PyTorch + DeepSpeed"
     runs-on: [intel-cpu, 8-cpu, ci]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -116,16 +106,6 @@ jobs:
     name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
     runs-on: [intel-cpu, 8-cpu, ci]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -202,16 +182,6 @@ jobs:
     if: inputs.image_postfix != '-push-ci'
     runs-on: [intel-cpu, 8-cpu, ci]
     steps:
-      - name: Cleanup disk
-        run: |
-          sudo ls -l /usr/local/lib/
-          sudo ls -l /usr/share/
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/dotnet
-          sudo du -sh /usr/local/lib/
-          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -13,7 +13,7 @@ concurrency:
 jobs:
   latest-with-torch-nightly-docker:
     name: "Nightly PyTorch + Stable TensorFlow"
-    runs-on: ubuntu-22.04
+    runs-on: [intel-cpu, 8-cpu, ci]
     steps:
       - name: Cleanup disk
         run: |
@@ -50,7 +50,7 @@ jobs:

   nightly-torch-deepspeed-docker:
     name: "Nightly PyTorch + DeepSpeed"
-    runs-on: ubuntu-22.04
+    runs-on: [intel-cpu, 8-cpu, ci]
     steps:
       - name: Cleanup disk
         run: |
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         version: ["1.13", "1.12", "1.11"]
-    runs-on: ubuntu-22.04
+    runs-on: [intel-cpu, 8-cpu, ci]
     steps:
       -
         name: Set up Docker Buildx
@@ -60,7 +60,7 @@ jobs:
       fail-fast: false
       matrix:
         version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"]
-    runs-on: ubuntu-22.04
+    runs-on: [intel-cpu, 8-cpu, ci]
     steps:
       -
         name: Set up Docker Buildx
@@ -56,7 +56,7 @@ jobs:
           nvidia-smi

       - name: Tailscale # In order to be able to SSH when a test fails
-        uses: huggingface/tailscale-action@v1
+        uses: huggingface/tailscale-action@main
         with:
           authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
           slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
@@ -51,6 +51,10 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
 # Some tests require quanto
 RUN python3 -m pip install --no-cache-dir quanto

+# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
+# (`deformable_detr`, `rwkv`, `mra`)
+RUN python3 -m pip uninstall -y ninja
+
 # For `dinat` model
 # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
 RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels
@@ -162,7 +162,7 @@ Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE`

 ## Offline Modus

-Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren.
+Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `HF_HUB_OFFLINE=1`, um dieses Verhalten zu aktivieren.

 <Tip>
@@ -179,7 +179,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 Führen Sie das gleiche Programm in einer Offline-Instanz mit aus:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -2,3 +2,4 @@

 perf_infer_gpu_many: perf_infer_gpu_one
 transformers_agents: agents
+quantization: quantization/overview
@@ -169,7 +169,7 @@ Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hu

 ## Offline mode

-Run 🤗 Transformers in a firewalled or offline environment with locally cached files by setting the environment variable `TRANSFORMERS_OFFLINE=1`.
+Run 🤗 Transformers in a firewalled or offline environment with locally cached files by setting the environment variable `HF_HUB_OFFLINE=1`.

 <Tip>
@@ -178,7 +178,7 @@ Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline train
 </Tip>

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
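The same offline behavior can be exercised from Python rather than the shell. A minimal sketch, assuming `google-t5/t5-small` is already in the local cache and noting that the variables must be set before `transformers` is imported:

```python
import os

# Assumption: the model was downloaded earlier on a machine with network access,
# so all files are already in the local Hub cache.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_DATASETS_OFFLINE"] = "1"

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

batch = tokenizer("translate English to Romanian: hello", return_tensors="pt")
out = model.generate(**batch, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```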
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # DETA

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The DETA model was proposed in [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
@@ -16,28 +16,36 @@ rendered properly in your Markdown viewer.

 # EfficientFormer

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191)
 by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a
 dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object
 detection and semantic segmentation.

 The abstract from the paper is the following:

 *Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
 However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
 times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
 challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
 complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
 unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
 To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
 Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
 Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
 Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
 Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
 iPhone 12 (compiled with CoreML), which runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1), and our largest model,
 EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
 reach extremely low latency on mobile devices while maintaining high performance.*

 This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
@@ -93,4 +101,4 @@ The original code can be found [here](https://github.com/snap-research/Efficient
     - call

 </tf>
 </frameworkcontent>
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # ErnieM

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # GPTSAN-japanese

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The GPTSAN-japanese model was released in the repository by Toshiyuki Sakamoto (tanreinama).
@@ -1,7 +1,7 @@
 <!--Copyright 2022 The HuggingFace Team and Microsoft. All rights reserved.

 Licensed under the MIT License; you may not use this file except in compliance with
 the License.

 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
@@ -14,9 +14,17 @@ rendered properly in your Markdown viewer.

 # Graphormer

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The Graphormer model was proposed in [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by
 Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen and Tie-Yan Liu. It is a Graph Transformer model, modified to allow computations on graphs instead of text sequences by generating embeddings and features of interest during preprocessing and collation, then using a modified attention.

 The abstract from the paper is the following:
@@ -15,6 +15,14 @@ rendered properly in your Markdown viewer.
 -->
 # Jukebox

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The Jukebox model was proposed in [Jukebox: A generative model for music](https://arxiv.org/pdf/2005.00341.pdf)
@@ -27,7 +35,7 @@ The abstract from the paper is the following:
 *We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*

 As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length.
 First, an autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The `AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
 The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.

 ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
@@ -16,12 +16,20 @@ rendered properly in your Markdown viewer.

 # MEGA

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
 MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
 stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA
 while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
 attractive option for long-document NLP tasks.

 The abstract from the paper is the following:
@@ -34,8 +42,8 @@ The original code can be found [here](https://github.com/facebookresearch/mega).

 ## Usage tips

 - MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional.
 - Mega-chunk is a variant of MEGA that reduces time and space complexity from quadratic to linear. Utilize chunking with `MegaConfig.use_chunking` and control chunk size with `MegaConfig.chunk_size`; see the sketch after this hunk.


 ## Implementation Notes
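An editor's sketch of the chunked configuration mentioned in the usage tips above, assuming the `MegaConfig`/`MegaModel` classes from this release; the chunk size is illustrative, not tuned:

```python
from transformers import MegaConfig, MegaModel

# Mega-chunk: attention is computed within fixed-size chunks, so cost grows
# linearly with sequence length instead of quadratically.
config = MegaConfig(use_chunking=True, chunk_size=64)
model = MegaModel(config)
print(model.config.chunk_size)
```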
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # Neighborhood Attention Transformer

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 NAT was proposed in [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143)
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # Nezha

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei et al.
@@ -25,8 +33,8 @@ The abstract from the paper is the following:
 *The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
 due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
 In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
 representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
 The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
 Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
 Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
 achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
@@ -85,4 +93,4 @@ This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The ori
 ## NezhaForQuestionAnswering

 [[autodoc]] NezhaForQuestionAnswering
     - forward
@@ -18,11 +18,51 @@ rendered properly in your Markdown viewer.

 ## Overview

-The PaliGemma model was proposed by Google. It is a 3B VLM composed by a Siglip-400m vision encoder and a Gemma-2B decoder linked by a multimodal linear projection. It is not a chat model with images. It cuts an image into a fixed number of VIT tokens and prepends it to an optional prompt. One particularity is that the model uses full block attention on all the image tokens plus the input text tokens. It comes in 3 resolutions, 224x224, 448x448 and 896x896 with 3 base models, with 55 fine-tuned versions for different tasks, and 2 mix models.
+The PaliGemma model was proposed in [PaliGemma – Google's Cutting-Edge Open Vision Language Model](https://huggingface.co/blog/paligemma) by Google. It is a 3B vision-language model composed of a [SigLIP](siglip) vision encoder and a [Gemma](gemma) language decoder linked by a multimodal linear projection. It cuts an image into a fixed number of VIT tokens and prepends it to an optional prompt. One particularity is that the model uses full block attention on all the image tokens plus the input text tokens. It comes in 3 resolutions, 224x224, 448x448 and 896x896 with 3 base models, with 55 fine-tuned versions for different tasks, and 2 mix models.
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/paligemma/paligemma_arch.png"
+alt="drawing" width="600"/>
+
+<small> PaliGemma architecture. Taken from the <a href="https://huggingface.co/blog/paligemma">blog post.</a> </small>
+
+This model was contributed by [Molbap](https://huggingface.co/Molbap).
+
+## Usage tips
+
+Inference with PaliGemma can be performed as follows:
+
+```python
+from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+from PIL import Image
+import requests
+
+model_id = "google/paligemma-3b-mix-224"
+model = PaliGemmaForConditionalGeneration.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained(model_id)
+
+prompt = "What is on the flower?"
+image_file = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg?download=true"
+raw_image = Image.open(requests.get(image_file, stream=True).raw)
+inputs = processor(prompt, raw_image, return_tensors="pt")
+output = model.generate(**inputs, max_new_tokens=20)
+
+print(processor.decode(output[0], skip_special_tokens=True)[len(prompt):])
+```
+
+- PaliGemma is not meant for conversational use, and it works best when fine-tuning to a specific use case. Some downstream tasks on which PaliGemma can be fine-tuned include image captioning, visual question answering (VQA), object detection, referring expression segmentation and document understanding.
+- One can use `PaliGemmaProcessor` to prepare images, text and optional labels for the model. When fine-tuning a PaliGemma model, the `suffix` argument can be passed to the processor which creates the `labels` for the model:
+
+```python
+prompt = "What is on the flower?"
+answer = "a bee"
+inputs = processor(text=prompt, images=raw_image, suffix=answer, return_tensors="pt")
+```
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PaliGemma. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A blog post introducing all the features of PaliGemma can be found [here](https://huggingface.co/blog/paligemma).
+- Demo notebooks on how to fine-tune PaliGemma for VQA with the Trainer API along with inference can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/paligemma).
+- Demo notebooks on how to fine-tune PaliGemma on a custom dataset (receipt image -> JSON) along with inference can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/PaliGemma). 🌎

 ## PaliGemmaConfig
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # QDQBERT

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # REALM

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a
@@ -86,4 +94,4 @@ This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The origi

 [[autodoc]] RealmForOpenQA
     - block_embedding_to
     - forward
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # Speech2Text2

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
@@ -38,12 +38,17 @@ to repeatedly detect a much richer set of interest points than the initial pre-a
 traditional corner detector. The final system gives rise to state-of-the-art homography estimation results on HPatches
 when compared to LIFT, SIFT and ORB.*

-## How to use
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/superpoint_architecture.png"
+alt="drawing" width="500"/>
+
+<small> SuperPoint overview. Taken from the <a href="https://arxiv.org/abs/1712.07629v4">original paper.</a> </small>
+
+## Usage tips

 Here is a quick example of using the model to detect interest points in an image:

 ```python
-from transformers import AutoImageProcessor, AutoModel
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
 import torch
 from PIL import Image
 import requests
@@ -52,7 +57,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
 image = Image.open(requests.get(url, stream=True).raw)

 processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
-model = AutoModel.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

 inputs = processor(image, return_tensors="pt")
 outputs = model(**inputs)
@@ -64,7 +69,7 @@ You can also feed multiple images to the model. Due to the nature of SuperPoint,
 you will need to use the mask attribute to retrieve the respective information :

 ```python
-from transformers import AutoImageProcessor, AutoModel
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
 import torch
 from PIL import Image
 import requests
@@ -77,7 +82,7 @@ image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
 images = [image_1, image_2]

 processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
-model = AutoModel.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

 inputs = processor(images, return_tensors="pt")
 outputs = model(**inputs)
@@ -103,6 +108,12 @@ cv2.imwrite("output_image.png", image)
 This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
 The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).

+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SuperPoint. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A notebook showcasing inference and visualization with SuperPoint can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb). 🌎
+
 ## SuperPointConfig

 [[autodoc]] SuperPointConfig
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # TVLT

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
@@ -60,7 +68,7 @@ The original code can be found [here](https://github.com/zinengtang/TVLT). This

 [[autodoc]] TvltFeatureExtractor
     - __call__


 ## TvltModel

 [[autodoc]] TvltModel
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # Hybrid Vision Transformer (ViT Hybrid)

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview

 The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
@@ -30,7 +30,7 @@ Tips:
 - Usage of X-CLIP is identical to [CLIP](clip).

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/xclip_architecture.png"
 alt="drawing" width="600"/>

 <small> X-CLIP architecture. Taken from the <a href="https://arxiv.org/abs/2208.02816">original paper.</a> </small>
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.

 # XLM-ProphetNet

+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 <div class="flex flex-wrap space-x-1">
 <a href="https://huggingface.co/models?filter=xprophetnet">
 <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
@@ -81,6 +81,8 @@ model = AutoModelForCausalLM.from_pretrained(model_id)
 model.load_adapter(peft_model_id)
 ```

+Check out the [API documentation](#transformers.integrations.PeftAdapterMixin) section below for more details.
+
 ## Load in 8bit or 4bit

 The `bitsandbytes` integration supports 8bit and 4bit precision data types, which are useful for loading large models because it saves memory (see the `bitsandbytes` integration [guide](./quantization#bitsandbytes-integration) to learn more). Add the `load_in_8bit` or `load_in_4bit` parameters to [`~PreTrainedModel.from_pretrained`] and set `device_map="auto"` to effectively distribute the model to your hardware:
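As a concrete illustration of the quantized-loading path described in that last paragraph, a minimal sketch; the base-model and adapter ids are illustrative placeholders, and `bitsandbytes` plus a CUDA-capable GPU are assumed:

```python
from transformers import AutoModelForCausalLM

# Load the base model in 8bit and spread it across available devices.
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    load_in_8bit=True,
    device_map="auto",
)
# Attach any PEFT adapter trained for this base model.
model.load_adapter("ybelkada/opt-350m-lora")
```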
@@ -227,6 +229,19 @@ lora_config = LoraConfig(
 model.add_adapter(lora_config)
 ```

+## API docs
+
+[[autodoc]] integrations.PeftAdapterMixin
+    - load_adapter
+    - add_adapter
+    - set_adapter
+    - disable_adapters
+    - enable_adapters
+    - active_adapters
+    - get_adapter_state_dict
+
+
+
 <!--
 TODO: (@younesbelkada @stevhliu)
@@ -52,7 +52,7 @@ Use the table below to help you decide which quantization method to use.
 | [bitsandbytes](./bitsandbytes) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/TimDettmers/bitsandbytes |
 | [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
 | GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
-| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
+| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
 | [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
 | [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto |
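To make the widened GPTQ bit-width range concrete, a minimal quantization sketch; the checkpoint and calibration dataset are illustrative, and `optimum` plus `auto-gptq` on a GPU machine are assumed:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "facebook/opt-125m"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# bits may now be 2, 3, 4 or 8, per the updated table row above
gptq_config = GPTQConfig(bits=3, dataset="c4", tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=gptq_config
)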
@@ -204,7 +204,7 @@ Pass your text to the tokenizer:
 The tokenizer returns a dictionary containing:

 * [input_ids](./glossary#input-ids): numerical representations of your tokens.
-* [attention_mask](.glossary#attention-mask): indicates which tokens should be attended to.
+* [attention_mask](./glossary#attention-mask): indicates which tokens should be attended to.

 A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length:
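A short sketch of the batched call described in that last context line; the checkpoint is illustrative:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
batch = tokenizer(
    ["Hello world.", "A noticeably longer second sentence that forces padding."],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Both tensors share one shape; attention_mask is 0 where padding was added.
print(batch["input_ids"].shape, batch["attention_mask"].shape)
```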
@@ -154,7 +154,7 @@ Los modelos preentrenados se descargan y almacenan en caché localmente en: `~/.

 ## Modo Offline

-🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `TRANSFORMERS_OFFLINE=1` para habilitar este comportamiento.
+🤗 Transformers puede ejecutarse en un entorno con firewall o fuera de línea (offline) usando solo archivos locales. Configura la variable de entorno `HF_HUB_OFFLINE=1` para habilitar este comportamiento.

 <Tip>

@@ -171,7 +171,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 Ejecuta este mismo programa en una instancia offline con el siguiente comando:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -171,7 +171,7 @@ Les modèles pré-entraînés sont téléchargés et mis en cache localement dan

 ## Mode hors ligne

-🤗 Transformers peut fonctionner dans un environnement cloisonné ou hors ligne en n'utilisant que des fichiers locaux. Définissez la variable d'environnement `TRANSFORMERS_OFFLINE=1` pour activer ce mode.
+🤗 Transformers peut fonctionner dans un environnement cloisonné ou hors ligne en n'utilisant que des fichiers locaux. Définissez la variable d'environnement `HF_HUB_OFFLINE=1` pour activer ce mode.

 <Tip>

@@ -180,7 +180,7 @@ Ajoutez [🤗 Datasets](https://huggingface.co/docs/datasets/) à votre processu
 </Tip>

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -152,7 +152,7 @@ I modelli pre-allenati sono scaricati e memorizzati localmente nella cache in: `

 ## Modalità Offline

-🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `TRANSFORMERS_OFFLINE=1` per abilitare questo comportamento.
+🤗 Transformers può essere eseguita in un ambiente firewalled o offline utilizzando solo file locali. Imposta la variabile d'ambiente `HF_HUB_OFFLINE=1` per abilitare questo comportamento.

 <Tip>

@@ -169,7 +169,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 Esegui lo stesso programma in un'istanza offline con:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -157,7 +157,7 @@ conda install conda-forge::transformers

 ## オフラインモード

-🤗 Transformersはローカルファイルのみを使用することでファイアウォールやオフラインの環境でも動作させることができます。この動作を有効にするためには、環境変数`TRANSFORMERS_OFFLINE=1`を設定します。
+🤗 Transformersはローカルファイルのみを使用することでファイアウォールやオフラインの環境でも動作させることができます。この動作を有効にするためには、環境変数`HF_HUB_OFFLINE=1`を設定します。

 <Tip>

@@ -174,7 +174,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 オフラインインスタンスでこの同じプログラムを実行します:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -157,7 +157,7 @@ conda install conda-forge::transformers

 ## 오프라인 모드[[offline-mode]]

-🤗 Transformers를 로컬 파일만 사용하도록 해서 방화벽 또는 오프라인 환경에서 실행할 수 있습니다. 활성화하려면 `TRANSFORMERS_OFFLINE=1` 환경 변수를 설정하세요.
+🤗 Transformers를 로컬 파일만 사용하도록 해서 방화벽 또는 오프라인 환경에서 실행할 수 있습니다. 활성화하려면 `HF_HUB_OFFLINE=1` 환경 변수를 설정하세요.

 <Tip>

@@ -174,7 +174,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 오프라인 기기에서 동일한 프로그램을 다음과 같이 실행할 수 있습니다.

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -173,7 +173,7 @@ No Windows, este diretório pré-definido é dado por `C:\Users\username\.cache\
 ## Modo Offline

 O 🤗 Transformers também pode ser executado num ambiente de firewall ou fora da rede (offline) usando arquivos locais.
-Para tal, configure a variável de ambiente de modo que `TRANSFORMERS_OFFLINE=1`.
+Para tal, configure a variável de ambiente de modo que `HF_HUB_OFFLINE=1`.

 <Tip>

@@ -191,7 +191,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 Execute esse mesmo programa numa instância offline com o seguinte comando:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
@@ -169,7 +169,7 @@ conda install conda-forge::transformers

 ## 离线模式

-🤗 Transformers 可以仅使用本地文件在防火墙或离线环境中运行。设置环境变量 `TRANSFORMERS_OFFLINE=1` 以启用该行为。
+🤗 Transformers 可以仅使用本地文件在防火墙或离线环境中运行。设置环境变量 `HF_HUB_OFFLINE=1` 以启用该行为。

 <Tip>

@@ -186,7 +186,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog
 在离线环境中运行相同的程序:

 ```bash
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \
 python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
File diff suppressed because it is too large
@@ -1,17 +1,21 @@
 import copy
+import importlib.metadata
 import json
 import os
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
+from packaging import version

 from .configuration_utils import PretrainedConfig
 from .utils import is_hqq_available, is_quanto_available, logging


 if is_quanto_available():
-    from quanto import QBitsTensor, qint2, qint4
+    quanto_version = version.parse(importlib.metadata.version("quanto"))
+    if quanto_version >= version.parse("0.2.0"):
+        from quanto import AffineQuantizer, MaxOptimizer, qint2, qint4

 if is_hqq_available():
     from hqq.core.quantize import Quantizer as HQQQuantizer
@@ -488,6 +492,13 @@ class QuantoQuantizedCache(QuantizedCache):

     def __init__(self, cache_config: CacheConfig) -> None:
         super().__init__(cache_config)
+        quanto_version = version.parse(importlib.metadata.version("quanto"))
+        if quanto_version < version.parse("0.2.0"):
+            raise ImportError(
+                f"You need quanto package version to be greater or equal than 0.2.0 to use `QuantoQuantizedCache`. Detected version {quanto_version}. "
+                f"Please upgrade quanto with `pip install -U quanto`"
+            )
+
         if self.nbits not in [2, 4]:
             raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}")
@@ -500,9 +511,11 @@ class QuantoQuantizedCache(QuantizedCache):
         )

         self.qtype = qint4 if self.nbits == 4 else qint2
+        self.optimizer = MaxOptimizer()  # hardcode as it's the only one for per-channel quantization

     def _quantize(self, tensor, axis):
-        qtensor = QBitsTensor.quantize(tensor, axis=axis, qtype=self.qtype, group_size=self.q_group_size)
+        scale, zeropoint = self.optimizer(tensor, self.qtype.bits, axis, self.q_group_size)
+        qtensor = AffineQuantizer.apply(tensor, self.qtype, axis, self.q_group_size, scale, zeropoint)
        return qtensor

     def _dequantize(self, qtensor):
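As an end-to-end illustration of what this backend is used for, a hedged sketch of generation with a quantized KV cache; it assumes `quanto>=0.2.0` and a transformers build that supports `cache_implementation="quantized"`, and the checkpoint is illustrative:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m", torch_dtype=torch.float16, device_map="auto"
)

inputs = tok("The quantized KV cache lets you", return_tensors="pt").to(model.device)
# The quanto backend below routes through QuantoQuantizedCache._quantize shown above.
out = model.generate(
    **inputs,
    max_new_tokens=20,
    cache_implementation="quantized",
    cache_config={"backend": "quanto", "nbits": 4},
)
print(tok.decode(out[0], skip_special_tokens=True))
```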
@@ -26,6 +26,7 @@ from ..utils import (
     is_safetensors_available,
     is_tf_available,
     is_torch_available,
+    is_torch_npu_available,
 )
 from . import BaseTransformersCLICommand
@@ -88,6 +89,7 @@ class EnvironmentCommand(BaseTransformersCLICommand):

         pt_version = torch.__version__
         pt_cuda_available = torch.cuda.is_available()
+        pt_npu_available = is_torch_npu_available()

         tf_version = "not installed"
         tf_cuda_available = "NA"
@@ -129,9 +131,16 @@ class EnvironmentCommand(BaseTransformersCLICommand):
             "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
             "Jax version": f"{jax_version}",
             "JaxLib version": f"{jaxlib_version}",
-            "Using GPU in script?": "<fill in>",
             "Using distributed or parallel set-up in script?": "<fill in>",
         }
+        if is_torch_available():
+            if pt_cuda_available:
+                info["Using GPU in script?"] = "<fill in>"
+                info["GPU type"] = torch.cuda.get_device_name()
+            elif pt_npu_available:
+                info["Using NPU in script?"] = "<fill in>"
+                info["NPU type"] = torch.npu.get_device_name()
+                info["CANN version"] = torch.version.cann
+
         print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
         print(self.format_dict(info))
@@ -536,9 +536,9 @@ class PretrainedConfig(PushToHubMixin):
         force_download (`bool`, *optional*, defaults to `False`):
             Whether or not to force to (re-)download the configuration files and override the cached versions if
             they exist.
-        resume_download (`bool`, *optional*, defaults to `False`):
-            Whether or not to delete incompletely received file. Attempts to resume the download if such a file
-            exists.
+        resume_download:
+            Deprecated and ignored. All downloads are now resumed by default when possible.
+            Will be removed in v5 of Transformers.
         proxies (`Dict[str, str]`, *optional*):
             A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
             'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
@@ -198,7 +198,10 @@ def get_class_in_module(class_name: str, module_path: Union[str, os.PathLike]) -
     Returns:
         `typing.Type`: The class looked for.
     """
-    name = os.path.normpath(module_path).rstrip(".py").replace(os.path.sep, ".")
+    name = os.path.normpath(module_path)
+    if name.endswith(".py"):
+        name = name[:-3]
+    name = name.replace(os.path.sep, ".")
     module_spec = importlib.util.spec_from_file_location(name, location=Path(HF_MODULES_CACHE) / module_path)
     module = sys.modules.get(name)
     if module is None:
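Why this change matters, sketched for clarity: `str.rstrip(".py")` removes any trailing run of the characters `.`, `p` and `y`, not the literal suffix, so module names ending in those letters get mangled. The names below are hypothetical:

```python
# rstrip strips characters from the set {'.', 'p', 'y'}, not the suffix ".py"
print("modeling_happy.py".rstrip(".py"))  # -> "modeling_ha" (wrong)

# The fixed logic removes only a literal ".py" suffix:
name = "modeling_happy.py"
if name.endswith(".py"):
    name = name[:-3]
print(name)  # -> "modeling_happy"
```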
@@ -823,6 +823,8 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
                 "revision": revision,
                 "proxies": proxies,
                 "token": token,
+                "cache_dir": cache_dir,
+                "local_files_only": local_files_only,
             }
             if has_file(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, **has_file_kwargs):
                 is_sharded = True
@@ -2864,6 +2864,8 @@ class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushT
                 "revision": revision,
                 "proxies": proxies,
                 "token": token,
+                "cache_dir": cache_dir,
+                "local_files_only": local_files_only,
             }
             if has_file(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, **has_file_kwargs):
                 is_sharded = True
@@ -3048,6 +3048,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                 " ignored."
             )

+        if gguf_file is not None and not is_accelerate_available():
+            raise ValueError("accelerate is required when loading a GGUF file `pip install accelerate`.")
+
         if commit_hash is None:
             if not isinstance(config, PretrainedConfig):
                 # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible
@@ -3392,70 +3395,75 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                         )
                         if resolved_archive_file is not None:
                             is_sharded = True
 
-                    if not local_files_only and resolved_archive_file is not None:
-                        if filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]:
-                            # If the PyTorch file was found, check if there is a safetensors file on the repository
-                            # If there is no safetensors file on the repositories, start an auto conversion
-                            safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME
-                            has_file_kwargs = {
-                                "revision": revision,
-                                "proxies": proxies,
-                                "token": token,
-                            }
-                            cached_file_kwargs = {
-                                "cache_dir": cache_dir,
-                                "force_download": force_download,
-                                "resume_download": resume_download,
-                                "local_files_only": local_files_only,
-                                "user_agent": user_agent,
-                                "subfolder": subfolder,
-                                "_raise_exceptions_for_gated_repo": False,
-                                "_raise_exceptions_for_missing_entries": False,
-                                "_commit_hash": commit_hash,
-                                **has_file_kwargs,
-                            }
-                            if not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs):
-                                Thread(
-                                    target=auto_conversion,
-                                    args=(pretrained_model_name_or_path,),
-                                    kwargs={"ignore_errors_during_conversion": True, **cached_file_kwargs},
-                                    name="Thread-autoconversion",
-                                ).start()
-                    else:
-                        # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file.
-                        # We try those to give a helpful error message.
-                        has_file_kwargs = {
-                            "revision": revision,
-                            "proxies": proxies,
-                            "token": token,
-                        }
-                        if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):
-                            raise EnvironmentError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights."
-                                " Use `from_tf=True` to load this model from those weights."
-                            )
-                        elif has_file(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME, **has_file_kwargs):
-                            raise EnvironmentError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use"
-                                " `from_flax=True` to load this model from those weights."
-                            )
-                        elif variant is not None and has_file(
-                            pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
-                        ):
-                            raise EnvironmentError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant"
-                                f" {variant}. Use `variant=None` to load this model from those weights."
-                            )
-                        else:
-                            raise EnvironmentError(
-                                f"{pretrained_model_name_or_path} does not appear to have a file named"
-                                f" {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
-                                f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
-                            )
+                    if not local_files_only and not is_offline_mode():
+                        if resolved_archive_file is not None:
+                            if filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]:
+                                # If the PyTorch file was found, check if there is a safetensors file on the repository
+                                # If there is no safetensors file on the repositories, start an auto conversion
+                                safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME
+                                has_file_kwargs = {
+                                    "revision": revision,
+                                    "proxies": proxies,
+                                    "token": token,
+                                    "cache_dir": cache_dir,
+                                    "local_files_only": local_files_only,
+                                }
+                                cached_file_kwargs = {
+                                    "cache_dir": cache_dir,
+                                    "force_download": force_download,
+                                    "resume_download": resume_download,
+                                    "local_files_only": local_files_only,
+                                    "user_agent": user_agent,
+                                    "subfolder": subfolder,
+                                    "_raise_exceptions_for_gated_repo": False,
+                                    "_raise_exceptions_for_missing_entries": False,
+                                    "_commit_hash": commit_hash,
+                                    **has_file_kwargs,
+                                }
+                                if not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs):
+                                    Thread(
+                                        target=auto_conversion,
+                                        args=(pretrained_model_name_or_path,),
+                                        kwargs={"ignore_errors_during_conversion": True, **cached_file_kwargs},
+                                        name="Thread-autoconversion",
+                                    ).start()
+                        else:
+                            # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file.
+                            # We try those to give a helpful error message.
+                            has_file_kwargs = {
+                                "revision": revision,
+                                "proxies": proxies,
+                                "token": token,
+                            }
+                            if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs):
+                                raise EnvironmentError(
+                                    f"{pretrained_model_name_or_path} does not appear to have a file named"
+                                    f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights."
+                                    " Use `from_tf=True` to load this model from those weights."
+                                )
+                            elif has_file(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME, **has_file_kwargs):
+                                raise EnvironmentError(
+                                    f"{pretrained_model_name_or_path} does not appear to have a file named"
+                                    f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use"
+                                    " `from_flax=True` to load this model from those weights."
+                                )
+                            elif variant is not None and has_file(
+                                pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs
+                            ):
+                                raise EnvironmentError(
+                                    f"{pretrained_model_name_or_path} does not appear to have a file named"
+                                    f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant"
+                                    f" {variant}. Use `variant=None` to load this model from those weights."
+                                )
+                            else:
+                                raise EnvironmentError(
+                                    f"{pretrained_model_name_or_path} does not appear to have a file named"
+                                    f" {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)},"
+                                    f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
+                                )
 
                 except EnvironmentError:
                     # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted
                     # to the original exception.
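A behavioral sketch of this hunk (hypothetical repo id; `HF_HUB_OFFLINE` must be set before transformers is imported): once offline mode or `local_files_only` is in effect, loading a `.bin`-only checkpoint no longer spawns the background `Thread-autoconversion` safetensors conversion.

```python
import os

os.environ["HF_HUB_OFFLINE"] = "1"  # read at import time by the hub client

from transformers import AutoModel

# Previously this path could still try to reach the Hub to check for (and
# convert to) safetensors weights; now no conversion thread is started.
model = AutoModel.from_pretrained("someuser/bin-only-model", local_files_only=True)
```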
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -67,7 +67,6 @@ from . import (
     deit,
     deprecated,
     depth_anything,
-    deta,
     detr,
     dialogpt,
     dinat,
@@ -77,13 +76,11 @@ from . import (
     donut,
     dpr,
     dpt,
-    efficientformer,
     efficientnet,
     electra,
     encodec,
     encoder_decoder,
     ernie,
-    ernie_m,
     esm,
     falcon,
     fastspeech2_conformer,
@@ -104,8 +101,6 @@ from . import (
     gpt_neox_japanese,
     gpt_sw3,
     gptj,
-    gptsan_japanese,
-    graphormer,
     grounding_dino,
     groupvit,
     herbert,
@@ -118,7 +113,6 @@ from . import (
     instructblip,
     jamba,
     jetmoe,
-    jukebox,
     kosmos2,
     layoutlm,
     layoutlmv2,
@@ -142,7 +136,6 @@ from . import (
     maskformer,
     mbart,
     mbart50,
-    mega,
     megatron_bert,
     megatron_gpt2,
     mgp_str,
@@ -161,8 +154,6 @@ from . import (
     musicgen,
     musicgen_melody,
     mvp,
-    nat,
-    nezha,
     nllb,
     nllb_moe,
     nougat,
@@ -190,11 +181,9 @@ from . import (
     prophetnet,
     pvt,
     pvt_v2,
-    qdqbert,
     qwen2,
     qwen2_moe,
     rag,
-    realm,
     recurrent_gemma,
     reformer,
     regnet,
@@ -215,7 +204,6 @@ from . import (
     siglip,
     speech_encoder_decoder,
     speech_to_text,
-    speech_to_text_2,
     speecht5,
     splinter,
     squeezebert,
@@ -234,7 +222,6 @@ from . import (
     timesformer,
     timm_backbone,
     trocr,
-    tvlt,
     tvp,
     udop,
     umt5,
@@ -250,7 +237,6 @@ from . import (
     vision_text_dual_encoder,
     visual_bert,
     vit,
-    vit_hybrid,
     vit_mae,
     vit_msn,
     vitdet,
@@ -267,7 +253,6 @@ from . import (
     x_clip,
     xglm,
     xlm,
-    xlm_prophetnet,
     xlm_roberta,
     xlm_roberta_xl,
     xlnet,
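Each module dropped from this import list was moved, not deleted: it now lives one package deeper, under `transformers.models.deprecated` (a sketch below, using Nezha; the public classes stay importable through the auto mappings updated in the next hunks).

```python
# New location of a moved module (Nezha as the example):
from transformers.models.deprecated.nezha import configuration_nezha

print(configuration_nezha.NezhaConfig.model_type)  # "nezha"
```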
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -585,14 +585,29 @@ MODEL_NAMES_MAPPING = OrderedDict(
 # `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
 DEPRECATED_MODELS = [
     "bort",
+    "deta",
+    "efficientformer",
+    "ernie_m",
+    "gptsan_japanese",
+    "graphormer",
+    "jukebox",
     "mctct",
+    "mega",
     "mmbt",
+    "nat",
+    "nezha",
     "open_llama",
+    "qdqbert",
+    "realm",
     "retribert",
+    "speech_to_text_2",
     "tapex",
     "trajectory_transformer",
     "transfo_xl",
+    "tvlt",
     "van",
+    "vit_hybrid",
+    "xlm_prophetnet",
 ]
 
 SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
@@ -616,7 +631,11 @@ def model_type_to_module_name(key):
     """Converts a config key to the corresponding module."""
     # Special treatment
     if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME:
-        return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
+        key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key]
+
+        if key in DEPRECATED_MODELS:
+            key = f"deprecated.{key}"
+        return key
 
     key = key.replace("-", "_")
     if key in DEPRECATED_MODELS:
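A quick check (not part of the diff) of what the reworked routing returns, using entries from the `DEPRECATED_MODELS` list above:

```python
from transformers.models.auto.configuration_auto import model_type_to_module_name

print(model_type_to_module_name("nezha"))  # "deprecated.nezha" -> models/deprecated/nezha
print(model_type_to_module_name("bert"))   # "bert" (non-deprecated types are unaffected)
```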
diff --git a/src/transformers/models/deta/__init__.py b/src/transformers/models/deprecated/deta/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deprecated/deta/configuration_deta.py
@@ -14,9 +14,9 @@
 # limitations under the License.
 """DETA model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ..auto import CONFIG_MAPPING
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto import CONFIG_MAPPING
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deprecated/deta/image_processing_deta.py
@@ -19,9 +19,9 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
 
 import numpy as np
 
-from ...feature_extraction_utils import BatchFeature
-from ...image_processing_utils import BaseImageProcessor, get_size_dict
-from ...image_transforms import (
+from ....feature_extraction_utils import BatchFeature
+from ....image_processing_utils import BaseImageProcessor, get_size_dict
+from ....image_transforms import (
     PaddingMode,
     center_to_corners_format,
     corners_to_center_format,
@@ -31,7 +31,7 @@ from ...image_transforms import (
     rgb_to_id,
     to_channel_dimension_format,
 )
-from ...image_utils import (
+from ....image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
     AnnotationFormat,
@@ -48,7 +48,7 @@ from ...image_utils import (
     validate_annotations,
     validate_preprocess_arguments,
 )
-from ...utils import (
+from ....utils import (
     is_flax_available,
     is_jax_tensor,
     is_tf_available,
@@ -59,7 +59,7 @@ from ...utils import (
     is_vision_available,
     logging,
 )
-from ...utils.generic import TensorType
+from ....utils.generic import TensorType
 
 
 if is_torch_available():
diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py
@@ -28,8 +28,8 @@ from torch import Tensor, nn
 from torch.autograd import Function
 from torch.autograd.function import once_differentiable
 
-from ...activations import ACT2FN
-from ...file_utils import (
+from ....activations import ACT2FN
+from ....file_utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -38,12 +38,12 @@ from ...file_utils import (
     is_vision_available,
     replace_return_docstrings,
 )
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
-from ...modeling_outputs import BaseModelOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import meshgrid
-from ...utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
-from ...utils.backbone_utils import load_backbone
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import meshgrid
+from ....utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
+from ....utils.backbone_utils import load_backbone
 from .configuration_deta import DetaConfig
 
 
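Every hunk from here down repeats one mechanical pattern: the module moved one directory deeper (`models/<name>/` to `models/deprecated/<name>/`), so each relative import needs one more leading dot to reach the same target. A stdlib sketch of that arithmetic:

```python
from importlib.util import resolve_name

# Before the move, inside transformers.models.deta.modeling_deta:
print(resolve_name("...activations", package="transformers.models.deta"))
# -> transformers.activations

# After the move, inside transformers.models.deprecated.deta.modeling_deta:
print(resolve_name("....activations", package="transformers.models.deprecated.deta"))
# -> transformers.activations (same module, one level further up)
```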
diff --git a/src/transformers/models/efficientformer/__init__.py b/src/transformers/models/deprecated/efficientformer/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import (
+from ....utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
     is_tf_available,
diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
@@ -16,8 +16,8 @@
 
 from typing import List
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/efficientformer/image_processing_efficientformer.py b/src/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
@@ -18,13 +18,13 @@ from typing import Dict, List, Optional, Union
 
 import numpy as np
 
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
-from ...image_transforms import (
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
     get_resize_output_image_size,
     resize,
     to_channel_dimension_format,
 )
-from ...image_utils import (
+from ....image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
     ChannelDimension,
@@ -38,7 +38,7 @@ from ...image_utils import (
     validate_kwargs,
     validate_preprocess_arguments,
 )
-from ...utils import TensorType, logging
+from ....utils import TensorType, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/efficientformer/modeling_efficientformer.py b/src/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
@@ -23,10 +23,10 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
diff --git a/src/transformers/models/efficientformer/modeling_tf_efficientformer.py b/src/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
@@ -20,13 +20,13 @@ from typing import Optional, Tuple, Union
 
 import tensorflow as tf
 
-from ...activations_tf import ACT2FN
-from ...modeling_tf_outputs import (
+from ....activations_tf import ACT2FN
+from ....modeling_tf_outputs import (
     TFBaseModelOutput,
     TFBaseModelOutputWithPooling,
     TFImageClassifierOutput,
 )
-from ...modeling_tf_utils import (
+from ....modeling_tf_utils import (
     TFPreTrainedModel,
     TFSequenceClassificationLoss,
     get_initializer,
@@ -34,8 +34,8 @@ from ...modeling_tf_utils import (
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list, stable_softmax
-from ...utils import (
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
diff --git a/src/transformers/models/ernie_m/__init__.py b/src/transformers/models/deprecated/ernie_m/__init__.py
@@ -14,7 +14,7 @@
 from typing import TYPE_CHECKING
 
 # rely on isort to merge the imports
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/ernie_m/configuration_ernie_m.py b/src/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
@@ -19,7 +19,7 @@ from __future__ import annotations
 
 from typing import Dict
 
-from ...configuration_utils import PretrainedConfig
+from ....configuration_utils import PretrainedConfig
 
 
 class ErnieMConfig(PretrainedConfig):
diff --git a/src/transformers/models/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
@@ -22,8 +22,8 @@ import torch.utils.checkpoint
 from torch import nn, tensor
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
     MultipleChoiceModelOutput,
@@ -31,9 +31,9 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_ernie_m import ErnieMConfig
 
 
diff --git a/src/transformers/models/ernie_m/tokenization_ernie_m.py b/src/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
@@ -21,8 +21,8 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import sentencepiece as spm
 
-from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import logging
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/gptsan_japanese/__init__.py b/src/transformers/models/deprecated/gptsan_japanese/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import (
+from ....utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
     is_flax_available,
diff --git a/src/transformers/models/gptsan_japanese/configuration_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """GPTSAN-japanese model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
@@ -20,10 +20,10 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 
-from ...activations import ACT2FN
-from ...modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
     add_start_docstrings,
diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -22,8 +22,8 @@ from typing import List, Optional, Tuple, Union
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer
-from ...tokenization_utils_base import (
+from ....tokenization_utils import PreTrainedTokenizer
+from ....tokenization_utils_base import (
     BatchEncoding,
     PreTokenizedInput,
     PreTokenizedInputPair,
@@ -31,7 +31,7 @@ from ...tokenization_utils_base import (
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import PaddingStrategy, logging
+from ....utils import PaddingStrategy, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/graphormer/__init__.py b/src/transformers/models/deprecated/graphormer/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/graphormer/collating_graphormer.py b/src/transformers/models/deprecated/graphormer/collating_graphormer.py
@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Mapping
 import numpy as np
 import torch
 
-from ...utils import is_cython_available, requires_backends
+from ....utils import is_cython_available, requires_backends
 
 
 if is_cython_available():
diff --git a/src/transformers/models/graphormer/configuration_graphormer.py b/src/transformers/models/deprecated/graphormer/configuration_graphormer.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """Graphormer model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/deprecated/graphormer/modeling_graphormer.py
@@ -21,13 +21,13 @@ import torch
 import torch.nn as nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithNoAttention,
     SequenceClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ....modeling_utils import PreTrainedModel
+from ....utils import logging
 from .configuration_graphormer import GraphormerConfig
 
 
diff --git a/src/transformers/models/jukebox/__init__.py b/src/transformers/models/deprecated/jukebox/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/deprecated/jukebox/configuration_jukebox.py
@@ -17,8 +17,8 @@
 import os
 from typing import List, Union
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -24,10 +24,10 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import LayerNorm as FusedLayerNorm
 
-from ...activations import ACT2FN
-from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, logging
-from ...utils.logging import tqdm
+from ....activations import ACT2FN
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ....utils.logging import tqdm
 from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
 
 
diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/deprecated/jukebox/tokenization_jukebox.py
@@ -24,10 +24,10 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 import regex
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_base import BatchEncoding
-from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
-from ...utils.generic import _is_jax, _is_numpy
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
+from ....utils.generic import _is_jax, _is_numpy
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mega/__init__.py b/src/transformers/models/deprecated/mega/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import (
+from ....utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
     is_torch_available,
diff --git a/src/transformers/models/mega/configuration_mega.py b/src/transformers/models/deprecated/mega/configuration_mega.py
@@ -17,9 +17,9 @@
 from collections import OrderedDict
 from typing import Mapping
 
-from ...configuration_utils import PretrainedConfig
-from ...onnx import OnnxConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....onnx import OnnxConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py
@@ -23,8 +23,8 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithPoolingAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
     MaskedLMOutput,
@@ -33,9 +33,9 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import ALL_LAYERNORM_LAYERS
+from ....utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/nat/__init__.py b/src/transformers/models/deprecated/nat/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {"configuration_nat": ["NatConfig"]}
diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/deprecated/nat/configuration_nat.py
@@ -14,9 +14,9 @@
 # limitations under the License.
 """Neighborhood Attention Transformer model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
-from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/deprecated/nat/modeling_nat.py
@@ -23,11 +23,11 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import BackboneOutput
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....activations import ACT2FN
+from ....modeling_outputs import BackboneOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
     ModelOutput,
     OptionalDependencyNotAvailable,
     add_code_sample_docstrings,
@@ -38,7 +38,7 @@ from ...utils import (
     replace_return_docstrings,
     requires_backends,
 )
-from ...utils.backbone_utils import BackboneMixin
+from ....utils.backbone_utils import BackboneMixin
 from .configuration_nat import NatConfig
 
 
diff --git a/src/transformers/models/nezha/__init__.py b/src/transformers/models/deprecated/nezha/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/nezha/configuration_nezha.py b/src/transformers/models/deprecated/nezha/configuration_nezha.py
@@ -1,4 +1,4 @@
-from ... import PretrainedConfig
+from .... import PretrainedConfig
 
 
 class NezhaConfig(PretrainedConfig):
diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/deprecated/nezha/modeling_nezha.py
@@ -25,8 +25,8 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
     MaskedLMOutput,
@@ -36,9 +36,9 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
diff --git a/src/transformers/models/qdqbert/__init__.py b/src/transformers/models/deprecated/qdqbert/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
 
 
 _import_structure = {"configuration_qdqbert": ["QDQBertConfig"]}
diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """QDQBERT model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
@@ -25,8 +25,8 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -37,9 +37,9 @@ from ...modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/deprecated/realm/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/deprecated/realm/configuration_realm.py
@@ -14,8 +14,8 @@
 # limitations under the License.
 """REALM model configuration."""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py
@@ -23,16 +23,16 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
-from ...activations import ACT2FN
-from ...modeling_outputs import (
+from ....activations import ACT2FN
+from ....modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
     MaskedLMOutput,
     ModelOutput,
 )
-from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_realm import RealmConfig
 
 
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/deprecated/realm/retrieval_realm.py
@@ -20,8 +20,8 @@ from typing import Optional, Union
 import numpy as np
 from huggingface_hub import hf_hub_download
 
-from ... import AutoTokenizer
-from ...utils import logging
+from .... import AutoTokenizer
+from ....utils import logging
 
 
 _REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/deprecated/realm/tokenization_realm.py
@@ -19,9 +19,9 @@ import os
 import unicodedata
 from typing import List, Optional, Tuple
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...tokenization_utils_base import BatchEncoding
-from ...utils import PaddingStrategy, logging
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import PaddingStrategy, logging
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/deprecated/realm/tokenization_realm_fast.py
@@ -19,9 +19,9 @@ from typing import List, Optional, Tuple
 
 from tokenizers import normalizers
 
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, logging
+from ....tokenization_utils_base import BatchEncoding
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import PaddingStrategy, logging
 from .tokenization_realm import RealmTokenizer
 
 
diff --git a/src/transformers/models/speech_to_text_2/__init__.py b/src/transformers/models/deprecated/speech_to_text_2/__init__.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import TYPE_CHECKING
 
-from ...utils import (
+from ....utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
     is_sentencepiece_available,
Some files were not shown because too many files have changed in this diff.