diff --git a/.circleci/config.yml b/.circleci/config.yml
index 70da5dd90f2a89..b87a0aeabb8c5d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -134,11 +134,10 @@ jobs:
             command: pip freeze | tee installed.txt
         - store_artifacts:
             path: ~/transformers/installed.txt
-        - run: black --check --preview examples tests src utils
-        - run: isort --check-only examples tests src utils
+        - run: black --check examples tests src utils
+        - run: ruff examples tests src utils
         - run: python utils/custom_init_isort.py --check_only
         - run: python utils/sort_auto_mappings.py --check_only
-        - run: flake8 examples tests src utils
         - run: doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
         - run: python utils/check_doc_toc.py
diff --git a/Makefile b/Makefile
index 2febcfe85ebdf1..0ca30634790955 100644
--- a/Makefile
+++ b/Makefile
@@ -9,9 +9,8 @@ modified_only_fixup:
 	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
 	@if test -n "$(modified_py_files)"; then \
 		echo "Checking/fixing $(modified_py_files)"; \
-		black --preview $(modified_py_files); \
-		isort $(modified_py_files); \
-		flake8 $(modified_py_files); \
+		black $(modified_py_files); \
+		ruff $(modified_py_files) --fix; \
 	else \
 		echo "No library .py files were modified"; \
 	fi
@@ -48,11 +47,10 @@ repo-consistency:
 # this target runs checks on all files
 quality:
-	black --check --preview $(check_dirs)
-	isort --check-only $(check_dirs)
+	black --check $(check_dirs)
 	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
-	flake8 $(check_dirs)
+	ruff $(check_dirs)
 	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
 	python utils/check_doc_toc.py
@@ -67,8 +65,8 @@ extra_style_checks:
 # this target runs checks on all files and potentially modifies some of them
 style:
-	black --preview $(check_dirs)
-	isort $(check_dirs)
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
 	${MAKE} autogenerate_code
 	${MAKE} extra_style_checks
diff --git a/docs/source/en/pipeline_webserver.mdx b/docs/source/en/pipeline_webserver.mdx
index d9f12fa2b3a0d6..f62985ec26b5bb 100644
--- a/docs/source/en/pipeline_webserver.mdx
+++ b/docs/source/en/pipeline_webserver.mdx
@@ -96,7 +96,7 @@ while True:
             queues.append(rq)
         strings
         outs = pipe(strings, batch_size=len(strings))
-        for (rq, out) in zip(queues, outs):
+        for rq, out in zip(queues, outs):
             await rq.put(out)
 ```
diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx
index 2b66c96eccadd6..4c31cd2841895f 100644
--- a/docs/source/en/tasks/asr.mdx
+++ b/docs/source/en/tasks/asr.mdx
@@ -166,7 +166,6 @@ Unlike other data collators, this specific data collator needs to apply a differ
 >>> @dataclass
 ... class DataCollatorCTCWithPadding:
-
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"
diff --git a/docs/source/en/tasks/object_detection.mdx b/docs/source/en/tasks/object_detection.mdx
index 7cac02d3ef304d..411ed7d2e7393f 100644
--- a/docs/source/en/tasks/object_detection.mdx
+++ b/docs/source/en/tasks/object_detection.mdx
@@ -213,7 +213,6 @@ The `image_processor` expects the annotations to be in the following format: `{'
 ```py
 >>> def formatted_anns(image_id, category, area, bbox):
-
 ...     annotations = []
 ...     for i in range(0, len(category)):
 ...         new_ann = {
@@ -399,6 +398,7 @@ First, prepare the `cppe5["test"]` set: format the annotations and save the data
 ```py
 >>> import json
+
 >>> # format annotations the same as for training, no need for data augmentation
 >>> def val_formatted_anns(image_id, objects):
 ...     annotations = []
diff --git a/docs/source/es/tasks/asr.mdx b/docs/source/es/tasks/asr.mdx
index 7d331b11f7eaee..f3747a332d7f42 100644
--- a/docs/source/es/tasks/asr.mdx
+++ b/docs/source/es/tasks/asr.mdx
@@ -159,7 +159,6 @@ A diferencia de otros collators de datos, este tiene que aplicarle un método de
 >>> @dataclass
 ... class DataCollatorCTCWithPadding:
-
 ...     processor: AutoProcessor
 ...     padding: Union[bool, str] = "longest"
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index a22437f504e5cb..66bd7290758108 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -29,23 +29,23 @@ from typing import Callable, Optional
 import datasets
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import Dataset, load_dataset
-from PIL import Image
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import nltk  # Here to have a nice missing dependency error message early on
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from filelock import FileLock
 from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from PIL import Image
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoImageProcessor,
     AutoTokenizer,
diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index 2b8d07539e9b5a..0a97bffd9304b0 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -32,20 +32,20 @@ from pathlib import Path
 from typing import Dict, List, Optional
-import nltk
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import nltk
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 0d516878bcb684..607c9bb1ee7c88 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -34,19 +34,19 @@ from typing import Callable, Optional
 import datasets
-import numpy as np
-from datasets import Dataset, load_dataset
-from tqdm import tqdm
-
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index a2b45b12a224bf..6a06533b14e419 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -34,19 +34,19 @@ from pathlib import Path
 from typing import Dict, List, Optional, Tuple
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index 9fb7bdce0dcd9a..814d68a88e3716 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -33,19 +33,19 @@ from pathlib import Path
 from typing import Dict, List, Optional
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import flax
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
+from datasets import load_dataset
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_MASKED_LM_MAPPING,
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index eb86b98a27ab38..628b9b81b286c0 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -31,20 +31,21 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+from utils_qa import postprocess_qa_predictions
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -55,7 +56,6 @@
     is_tensorboard_available,
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
-from utils_qa import postprocess_qa_predictions
 logger = logging.getLogger(__name__)
@@ -301,6 +301,7 @@ def __post_init__(self):
 # endregion
+
 # region Create a train state
 def create_train_state(
     model: FlaxAutoModelForQuestionAnswering,
@@ -387,6 +388,7 @@ def create_learning_rate_fn(
 # endregion
+
 # region train data iterator
 def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
     """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
@@ -405,6 +407,7 @@ def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
 # endregion
+
 # region eval data iterator
 def eval_data_collator(dataset: Dataset, batch_size: int):
     """Returns batches of size `batch_size` from `eval dataset`. Sharding handled by `pad_shard_unpad` in the eval loop."""
@@ -934,7 +937,6 @@ def eval_step(state, batch):
     total_steps = step_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -975,7 +977,6 @@ def eval_step(state, batch):
             and (cur_step % training_args.eval_steps == 0 or cur_step % step_per_epoch == 0)
             and cur_step > 0
         ):
-
             eval_metrics = {}
             all_start_logits = []
             all_end_logits = []
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index 361746e2826e72..feda69592070f0 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -31,22 +31,22 @@ from typing import Callable, Optional
 import datasets
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import Dataset, load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import nltk  # Here to have a nice missing dependency error message early on
+import numpy as np
 import optax
-import transformers
+from datasets import Dataset, load_dataset
 from filelock import FileLock
 from flax import jax_utils, traverse_util
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py
index 6032408c685117..c47ea90d392a3d 100755
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -26,20 +26,20 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -586,7 +586,6 @@ def eval_step(state, batch):
     total_steps = steps_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (0/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -623,7 +622,6 @@ def eval_step(state, batch):
             train_metrics = []
         if (cur_step % training_args.eval_steps == 0 or cur_step % steps_per_epoch == 0) and cur_step > 0:
-
             # evaluate
             eval_loader = glue_eval_data_collator(eval_dataset, eval_batch_size)
             for batch in tqdm(
diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py
index a20aa4cfab5cc7..c7509433d95796 100644
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -28,20 +28,20 @@ from typing import Any, Callable, Dict, Optional, Tuple
 import datasets
-import numpy as np
-from datasets import ClassLabel, load_dataset
-from tqdm import tqdm
-
 import evaluate
 import jax
 import jax.numpy as jnp
+import numpy as np
 import optax
-import transformers
+from datasets import ClassLabel, load_dataset
 from flax import struct, traverse_util
 from flax.jax_utils import pad_shard_unpad, replicate, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoTokenizer,
@@ -695,7 +695,6 @@ def compute_metrics():
     total_steps = step_per_epoch * num_epochs
     epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
     for epoch in epochs:
-
         train_start = time.time()
         train_metrics = []
@@ -731,7 +730,6 @@ def compute_metrics():
             train_metrics = []
         if cur_step % training_args.eval_steps == 0 and cur_step > 0:
-
             eval_metrics = {}
             # evaluate
             for batch in tqdm(
diff --git a/examples/flax/vision/run_image_classification.py b/examples/flax/vision/run_image_classification.py
index 33a277fa4f8a21..6a88f0f8d67b28 100644
--- a/examples/flax/vision/run_image_classification.py
+++ b/examples/flax/vision/run_image_classification.py
@@ -29,21 +29,22 @@ from pathlib import Path
 from typing import Callable, Optional
+import jax
+import jax.numpy as jnp
+import optax
+
 # for dataset and preprocessing
 import torch
 import torchvision
 import torchvision.transforms as transforms
-from tqdm import tqdm
-
-import jax
-import jax.numpy as jnp
-import optax
-import transformers
 from flax import jax_utils
 from flax.jax_utils import pad_shard_unpad, unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
 from huggingface_hub import Repository, create_repo
+from tqdm import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
diff --git a/examples/legacy/multiple_choice/run_multiple_choice.py b/examples/legacy/multiple_choice/run_multiple_choice.py
index d8007da6cb676c..451397042594f7 100644
--- a/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/examples/legacy/multiple_choice/run_multiple_choice.py
@@ -22,6 +22,7 @@ from typing import Dict, Optional
 import numpy as np
+from utils_multiple_choice import MultipleChoiceDataset, Split, processors
 import transformers
 from transformers import (
@@ -36,7 +37,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_multiple_choice import MultipleChoiceDataset, Split, processors
 logger = logging.getLogger(__name__)
diff --git a/examples/legacy/multiple_choice/utils_multiple_choice.py b/examples/legacy/multiple_choice/utils_multiple_choice.py
index 3dbc3689cc4893..9ffaa7971b5624 100644
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@@ -26,8 +26,8 @@ from typing import List, Optional
 import tqdm
-
 from filelock import FileLock
+
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
@@ -112,7 +112,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/legacy/pytorch-lightning/lightning_base.py b/examples/legacy/pytorch-lightning/lightning_base.py
index b3104a25a8b129..f246ecab0dd01b 100644
--- a/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/examples/legacy/pytorch-lightning/lightning_base.py
@@ -69,7 +69,7 @@ def __init__(
         config=None,
         tokenizer=None,
         model=None,
-        **config_kwargs
+        **config_kwargs,
     ):
         """Initialize a model, tokenizer and config."""
         super().__init__()
@@ -346,7 +346,7 @@ def generic_train(
     extra_callbacks=[],
     checkpoint_callback=None,
     logging_callback=None,
-    **extra_train_kwargs
+    **extra_train_kwargs,
 ):
     pl.seed_everything(args.seed)
diff --git a/examples/legacy/pytorch-lightning/run_glue.py b/examples/legacy/pytorch-lightning/run_glue.py
index 63b58bcf413c26..aa2349f2809fd4 100644
--- a/examples/legacy/pytorch-lightning/run_glue.py
+++ b/examples/legacy/pytorch-lightning/run_glue.py
@@ -7,21 +7,19 @@
 import numpy as np
 import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
 from torch.utils.data import DataLoader, TensorDataset
-from lightning_base import BaseTransformer, add_generic_args, generic_train
 from transformers import glue_compute_metrics as compute_metrics
 from transformers import glue_convert_examples_to_features as convert_examples_to_features
-from transformers import glue_output_modes
+from transformers import glue_output_modes, glue_tasks_num_labels
 from transformers import glue_processors as processors
-from transformers import glue_tasks_num_labels
 logger = logging.getLogger(__name__)
 class GLUETransformer(BaseTransformer):
-
     mode = "sequence-classification"
     def __init__(self, hparams):
diff --git a/examples/legacy/pytorch-lightning/run_ner.py b/examples/legacy/pytorch-lightning/run_ner.py
index b1bdd125c22eb8..3bcbdfee03b114 100644
--- a/examples/legacy/pytorch-lightning/run_ner.py
+++ b/examples/legacy/pytorch-lightning/run_ner.py
@@ -7,11 +7,10 @@
 import numpy as np
 import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
 from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch.nn import CrossEntropyLoss
 from torch.utils.data import DataLoader, TensorDataset
-
-from lightning_base import BaseTransformer, add_generic_args, generic_train
 from utils_ner import TokenClassificationTask
diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py
index 674e7a9accbf3a..d966b3f02f0315 100644
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -172,7 +172,6 @@ def train(args, train_dataset, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-
             # Skip past any already trained steps if resuming training
             if steps_trained_in_current_epoch > 0:
                 steps_trained_in_current_epoch -= 1
diff --git a/examples/legacy/question-answering/run_squad_trainer.py b/examples/legacy/question-answering/run_squad_trainer.py
index 314b140e828c59..7e3a6f28e0ba1e 100644
--- a/examples/legacy/question-answering/run_squad_trainer.py
+++ b/examples/legacy/question-answering/run_squad_trainer.py
@@ -30,9 +30,10 @@
     DataCollatorWithPadding,
     HfArgumentParser,
     SquadDataset,
+    Trainer,
+    TrainingArguments,
 )
 from transformers import SquadDataTrainingArguments as DataTrainingArguments
-from transformers import Trainer, TrainingArguments
 from transformers.trainer_utils import is_main_process
diff --git a/examples/legacy/run_chinese_ref.py b/examples/legacy/run_chinese_ref.py
index f7c09e37ff87d2..7d73580aa21566 100755
--- a/examples/legacy/run_chinese_ref.py
+++ b/examples/legacy/run_chinese_ref.py
@@ -4,6 +4,7 @@ from typing import List
 from ltp import LTP
+
 from transformers import BertTokenizer
@@ -93,7 +94,6 @@ def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokeni
     ref_ids = []
     for input_ids, chinese_word in zip(bert_res, ltp_res):
-
         input_tokens = []
         for id in input_ids:
             token = bert_tokenizer._convert_id_to_token(id)
diff --git a/examples/legacy/seq2seq/finetune_trainer.py b/examples/legacy/seq2seq/finetune_trainer.py
index f174f7fb5018f9..4e186c96d8c218 100755
--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
@@ -19,9 +19,10 @@ from dataclasses import dataclass, field
 from typing import Optional
-import transformers
 from seq2seq_trainer import Seq2SeqTrainer
 from seq2seq_training_args import Seq2SeqTrainingArguments
+
+import transformers
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
@@ -337,7 +338,6 @@ def main():
         metrics["val_loss"] = round(metrics["val_loss"], 4)
         if trainer.is_world_process_zero():
-
             handle_metrics("val", metrics, training_args.output_dir)
             all_metrics.update(metrics)
diff --git a/examples/legacy/seq2seq/old_test_calculate_rouge.py b/examples/legacy/seq2seq/old_test_calculate_rouge.py
index 17b87cb481a650..6cc15e02552be1 100644
--- a/examples/legacy/seq2seq/old_test_calculate_rouge.py
+++ b/examples/legacy/seq2seq/old_test_calculate_rouge.py
@@ -16,8 +16,8 @@
 from pathlib import Path
 import pandas as pd
-
 from rouge_cli import calculate_rouge_path
+
 from utils import calculate_rouge
@@ -87,7 +87,6 @@ def test_single_sent_scores_dont_depend_on_newline_sep():
 def test_pegasus_newline():
-
     pred = [
         """" "a person who has such a video needs to immediately give it to the investigators," prosecutor says . "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
     ]
diff --git a/examples/legacy/seq2seq/old_test_datasets.py b/examples/legacy/seq2seq/old_test_datasets.py
index b85d7966e97090..0b907b1ed9fbb6 100644
--- a/examples/legacy/seq2seq/old_test_datasets.py
+++ b/examples/legacy/seq2seq/old_test_datasets.py
@@ -17,11 +17,11 @@
 import numpy as np
 import pytest
-from torch.utils.data import DataLoader
-
 from pack_dataset import pack_data_dir
 from parameterized import parameterized
 from save_len_file import save_len_file
+from torch.utils.data import DataLoader
+
 from transformers import AutoTokenizer
 from transformers.models.mbart.modeling_mbart import shift_tokens_right
 from transformers.testing_utils import TestCasePlus, slow
diff --git a/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py b/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
index beb7f2bc9857fd..4aefeb388be631 100644
--- a/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
+++ b/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
@@ -18,6 +18,7 @@
 import unittest
 from parameterized import parameterized
+
 from transformers import FSMTForConditionalGeneration, FSMTTokenizer
 from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
 from utils import calculate_bleu
diff --git a/examples/legacy/seq2seq/old_test_seq2seq_examples.py b/examples/legacy/seq2seq/old_test_seq2seq_examples.py
index ecc0524c37d93b..864b97c7466a36 100644
--- a/examples/legacy/seq2seq/old_test_seq2seq_examples.py
+++ b/examples/legacy/seq2seq/old_test_seq2seq_examples.py
@@ -21,6 +21,7 @@
 from parameterized import parameterized
 from run_eval import run_generate
 from run_eval_search import run_search
+
 from transformers.testing_utils import CaptureStdout, TestCasePlus, slow
 from utils import ROUGE_KEYS
diff --git a/examples/legacy/seq2seq/pack_dataset.py b/examples/legacy/seq2seq/pack_dataset.py
index 6f226de2cc2ddd..8b069e452a7177 100755
--- a/examples/legacy/seq2seq/pack_dataset.py
+++ b/examples/legacy/seq2seq/pack_dataset.py
@@ -29,7 +29,6 @@
 def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
-
     finished_src, finished_tgt = [], []
     sorted_examples = list(zip(src_examples, tgt_examples))
diff --git a/examples/legacy/seq2seq/run_eval_search.py b/examples/legacy/seq2seq/run_eval_search.py
index e1a0c8660c9bf6..c72f038fc50ab2 100755
--- a/examples/legacy/seq2seq/run_eval_search.py
+++ b/examples/legacy/seq2seq/run_eval_search.py
@@ -20,6 +20,7 @@
 from collections import OrderedDict
 from run_eval import datetime_now, run_generate
+
 from utils import ROUGE_KEYS
diff --git a/examples/legacy/seq2seq/seq2seq_training_args.py b/examples/legacy/seq2seq/seq2seq_training_args.py
index 6ec220181ad90d..1583acd36fc4b7 100644
--- a/examples/legacy/seq2seq/seq2seq_training_args.py
+++ b/examples/legacy/seq2seq/seq2seq_training_args.py
@@ -17,6 +17,7 @@
 from typing import Optional
 from seq2seq_trainer import arg_to_scheduler
+
 from transformers import TrainingArguments
diff --git a/examples/legacy/seq2seq/utils.py b/examples/legacy/seq2seq/utils.py
index e207e4d0dbd068..2655165cf11adf 100644
--- a/examples/legacy/seq2seq/utils.py
+++ b/examples/legacy/seq2seq/utils.py
@@ -29,10 +29,10 @@
 import torch.distributed as dist
 from rouge_score import rouge_scorer, scoring
 from sacrebleu import corpus_bleu
+from sentence_splitter import add_newline_to_end_of_each_sentence
 from torch import nn
 from torch.utils.data import Dataset, Sampler
-from sentence_splitter import add_newline_to_end_of_each_sentence
 from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
 from transformers.models.bart.modeling_bart import shift_tokens_right
 from transformers.utils import cached_property
@@ -132,7 +132,7 @@ def __init__(
         type_path="train",
         n_obs=None,
         prefix="",
-        **dataset_kwargs
+        **dataset_kwargs,
     ):
         super().__init__()
         self.src_file = Path(data_dir).joinpath(type_path + ".source")
diff --git a/examples/legacy/token-classification/run_ner.py b/examples/legacy/token-classification/run_ner.py
index 477ccb50fb2565..212ea986b4245b 100644
--- a/examples/legacy/token-classification/run_ner.py
+++ b/examples/legacy/token-classification/run_ner.py
@@ -24,6 +24,7 @@
 import numpy as np
 from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
 from torch import nn
+from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
 import transformers
 from transformers import (
@@ -38,7 +39,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_ner import Split, TokenClassificationDataset, TokenClassificationTask
 logger = logging.getLogger(__name__)
diff --git a/examples/legacy/token-classification/run_tf_ner.py b/examples/legacy/token-classification/run_tf_ner.py
index 857d777238f2e2..df4770a70fa44d 100755
--- a/examples/legacy/token-classification/run_tf_ner.py
+++ b/examples/legacy/token-classification/run_tf_ner.py
@@ -24,6 +24,7 @@
 import numpy as np
 from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
+from utils_ner import Split, TFTokenClassificationDataset, TokenClassificationTask
 from transformers import (
     AutoConfig,
@@ -35,7 +36,6 @@
     TFTrainingArguments,
 )
 from transformers.utils import logging as hf_logging
-from utils_ner import Split, TFTokenClassificationDataset, TokenClassificationTask
 hf_logging.set_verbosity_info()
diff --git a/examples/legacy/token-classification/tasks.py b/examples/legacy/token-classification/tasks.py
index 409be0715da321..d893a2ab0347df 100644
--- a/examples/legacy/token-classification/tasks.py
+++ b/examples/legacy/token-classification/tasks.py
@@ -3,7 +3,6 @@ from typing import List, TextIO, Union
 from conllu import parse_incr
-
 from utils_ner import InputExample, Split, TokenClassificationTask
diff --git a/examples/legacy/token-classification/utils_ner.py b/examples/legacy/token-classification/utils_ner.py
index 35fcb5ef5b7d22..2b54c7c4a49159 100644
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -23,6 +23,7 @@ from typing import List, Optional, Union
 from filelock import FileLock
+
 from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available
@@ -240,7 +241,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py
index 6ccb5301f43ec3..20ddec4acb9ee1 100644
--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@@ -23,10 +23,10 @@ from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
index db6940fe38283a..78979e41553e0d 100644
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -19,6 +19,7 @@ from dataclasses import dataclass, field
 from typing import Optional
+import evaluate
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -33,7 +34,6 @@
     ToTensor,
 )
-import evaluate
 import transformers
 from transformers import (
     MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
index 15b27258d2bd56..3ba79d630e76a3 100644
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@@ -21,8 +21,13 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from torchvision.transforms import (
     CenterCrop,
@@ -35,12 +40,7 @@
 )
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import AutoConfig, AutoImageProcessor, AutoModelForImageClassification, SchedulerType, get_scheduler
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index 2a20e2ef45f8ec..ae01b7614e63ba 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -30,10 +30,10 @@ from typing import Optional
 import datasets
+import evaluate
 import torch
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     CONFIG_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 1592a72897ee83..998ca60b2700ad 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -33,15 +33,15 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator, DistributedType
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 9639af9513f98c..f44b0e3a01e7de 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -30,9 +30,9 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     CONFIG_MAPPING,
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 12d2bbcb546db7..ee469e48890e74 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -33,15 +33,15 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator, DistributedType
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 6c178e3862af3d..b0bcc567551cc7 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -30,17 +30,17 @@
 from typing import Optional, Union
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index db9b8701f23eda..dfbfe244e206f8 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -25,11 +25,12 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_qa import QuestionAnsweringTrainer
+from utils_qa import postprocess_qa_predictions
-import evaluate
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     AutoConfig,
     AutoModelForQuestionAnswering,
@@ -45,7 +46,6 @@
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index 305d32f0ce3527..4d2f5ef51d99c7 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -25,11 +25,12 @@
 from typing import Optional
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_qa import QuestionAnsweringTrainer
+from utils_qa import postprocess_qa_predictions_with_beam_search
-import evaluate
 import transformers
-from trainer_qa import QuestionAnsweringTrainer
 from transformers import (
     DataCollatorWithPadding,
     EvalPrediction,
@@ -44,7 +45,6 @@
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions_with_beam_search
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index 25b19825adfe49..9372de3298f2c0 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -27,18 +27,19 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
+from utils_qa import postprocess_qa_predictions_with_beam_search
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AdamW,
     DataCollatorWithPadding,
@@ -52,7 +53,6 @@
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions_with_beam_search
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index ffd62fe01ca445..6cbea37151da8e 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -27,18 +27,19 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
+from utils_qa import postprocess_qa_predictions
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
@@ -53,7 +54,6 @@
 )
 from transformers.utils import check_min_version, get_full_repo_name, send_example_telemetry
 from transformers.utils.versions import require_version
-from utils_qa import postprocess_qa_predictions
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index f742f4c7a44ac3..5fe5c1bddc6cf9 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -25,11 +25,11 @@
 from typing import List, Optional, Tuple
 import datasets
+import evaluate
 from datasets import load_dataset
+from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
-import evaluate
 import transformers
-from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
index 966c2fe48aafad..b1583aca1f0cac 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -21,17 +21,17 @@
 from dataclasses import dataclass, field
 from typing import Optional
+import evaluate
 import numpy as np
 import torch
 from datasets import load_dataset
+from huggingface_hub import hf_hub_download
 from PIL import Image
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import functional
-import evaluate
 import transformers
-from huggingface_hub import hf_hub_download
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
index e715f20d5fe034..68919e0cc5c57c 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@@ -22,21 +22,21 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo, hf_hub_download
 from PIL import Image
 from torch.utils.data import DataLoader
 from torchvision import transforms
 from torchvision.transforms import functional
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo, hf_hub_download
 from transformers import (
     AutoConfig,
     AutoImageProcessor,
diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
index 3b6acadec56812..603202e696cf9c 100755
--- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
+++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py
@@ -24,14 +24,14 @@
 import datasets
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
 from datasets import DatasetDict, concatenate_datasets, load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data.dataloader import DataLoader
 from tqdm.auto import tqdm
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AdamW,
     SchedulerType,
@@ -641,7 +641,6 @@ def prepare_dataset(batch):
             # update step
             if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
-
                 # compute grad norm for monitoring
                 scale = (
                     accelerator.scaler._scale.item()
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
index a282f66b8ad349..785609fbd56e88 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -26,11 +26,11 @@
 from typing import Dict, List, Optional, Union
 import datasets
+import evaluate
 import numpy as np
 import torch
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
@@ -708,7 +708,6 @@ def compute_metrics(pred):
     # Training
     if training_args.do_train:
-
         # use last checkpoint if exist
         if last_checkpoint is not None:
             checkpoint = last_checkpoint
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
index 3791615bc1981b..c841e99df21557 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -26,10 +26,10 @@
 from typing import Any, Dict, List, Optional, Union
 import datasets
+import evaluate
 import torch
 from datasets import DatasetDict, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index d680be320dd55a..b682f89ce5b829 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -25,13 +25,13 @@
 from typing import Optional
 import datasets
+import evaluate
 import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import load_dataset
+from filelock import FileLock
-import evaluate
 import transformers
-from filelock import FileLock
 from transformers import (
     AutoConfig,
     AutoModelForSeq2SeqLM,
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index e0c4176ec28904..8f669be72c5831 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -27,20 +27,20 @@
 from pathlib import Path
 import datasets
+import evaluate
 import nltk
 import numpy as np
 import torch
-from datasets import load_dataset
-from torch.utils.data import DataLoader
-from tqdm.auto import tqdm
-
-import evaluate
-import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
+from datasets import load_dataset
 from filelock import FileLock
 from huggingface_hub import Repository, create_repo
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index 306cd9a34f161a..d88a2ead64b4ae 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -24,8 +24,8 @@
 from unittest import mock
 import torch
-
 from accelerate.utils import write_basic_config
+
 from transformers.testing_utils import TestCasePlus, get_gpu_count, run_command, slow, torch_device
 from transformers.utils import is_apex_available
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 6b933bb86014ee..1e7ab534551192 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -24,10 +24,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 040cb44daa0a20..03de2cf6b553f4 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -22,17 +22,17 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     AutoConfig,
     AutoModelForSequenceClassification,
diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py
index 6cf3530a87923d..20b38a37cf5bc7 100755
--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@@ -25,10 +25,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index cfced5a1de6e57..065880e7e26e78 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -26,10 +26,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import ClassLabel, load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index f60ed1fc785d02..ad630472234ea4 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -27,17 +27,17 @@
 from pathlib import Path
 import datasets
+import evaluate
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import ClassLabel, load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 80787aba24bc57..cd82c779e8724d 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -25,10 +25,10 @@
 from typing import Optional
 import datasets
+import evaluate
 import numpy as np
 from datasets import load_dataset
-import evaluate
 import transformers
 from transformers import (
     AutoConfig,
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index ece3385748da65..a853d531edb861 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -27,18 +27,18 @@
 from pathlib import Path
 import datasets
+import evaluate
 import numpy as np
 import torch
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
 from datasets import load_dataset
+from huggingface_hub import Repository, create_repo
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-import evaluate
 import transformers
-from accelerate import Accelerator
-from accelerate.logging import get_logger
-from accelerate.utils import set_seed
-from huggingface_hub import Repository, create_repo
 from transformers import (
     CONFIG_MAPPING,
     MODEL_MAPPING,
@@ -69,7 +69,6 @@
 # Parsing input arguments
 def parse_args():
-
     parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
     parser.add_argument(
         "--dataset_name",
@@ -751,5 +750,4 @@ def postprocess_text(preds, labels):
 if __name__ == "__main__":
-
     main()
diff --git a/examples/research_projects/adversarial/run_hans.py b/examples/research_projects/adversarial/run_hans.py
index 0576471fbc50a6..3affbb7a69257a 100644
--- a/examples/research_projects/adversarial/run_hans.py
+++ b/examples/research_projects/adversarial/run_hans.py
@@ -22,6 +22,7 @@
 import numpy as np
 import torch
+from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
 import transformers
 from transformers import (
@@ -35,7 +36,6 @@
     set_seed,
 )
 from transformers.trainer_utils import is_main_process
-from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
 logger = logging.getLogger(__name__)
diff --git a/examples/research_projects/adversarial/utils_hans.py b/examples/research_projects/adversarial/utils_hans.py
index e54792ad2f82b9..f051e60f84fefd 100644
--- a/examples/research_projects/adversarial/utils_hans.py
+++ b/examples/research_projects/adversarial/utils_hans.py
@@ -20,8 +20,8 @@
 from typing import List, Optional, Union
 import tqdm
-
 from filelock import FileLock
+
 from transformers import (
     BartTokenizer,
     BartTokenizerFast,
@@ -134,7 +134,6 @@ def __init__(
             # and the others will use the cache.
             lock_path = cached_features_file + ".lock"
             with FileLock(lock_path):
-
                 if os.path.exists(cached_features_file) and not overwrite_cache:
                     logger.info(f"Loading features from cached file {cached_features_file}")
                     self.features = torch.load(cached_features_file)
diff --git a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
index d4121655e8233d..aad680f201c520 100755
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -25,14 +25,14 @@
 import numpy as np
 import torch
+from pabee.modeling_pabee_albert import AlbertForSequenceClassificationWithPabee
+from pabee.modeling_pabee_bert import BertForSequenceClassificationWithPabee
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 import transformers
-from pabee.modeling_pabee_albert import AlbertForSequenceClassificationWithPabee
-from pabee.modeling_pabee_bert import BertForSequenceClassificationWithPabee
 from transformers import (
     WEIGHTS_NAME,
     AdamW,
@@ -173,7 +173,6 @@ def train(args, train_dataset, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
-
             # Skip past any already trained steps if resuming training
             if steps_trained_in_current_epoch > 0:
                 steps_trained_in_current_epoch -= 1
@@ -263,7 +262,6 @@ def train(args, train_dataset, model, tokenizer):
 def evaluate(args, model, tokenizer, prefix="", patience=0):
-
     if args.model_type == "albert":
         model.albert.set_regression_threshold(args.regression_threshold)
         model.albert.set_patience(patience)
@@ -736,7 +734,6 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
     for checkpoint in checkpoints:
-
         global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
         prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""
diff --git a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
index 22c6f4de06f430..6a084d0741d5f5 100644
--- a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
@@ -4,6 +4,7 @@
 from unittest.mock import patch
 import run_glue_with_pabee
+
 from transformers.testing_utils import TestCasePlus
diff --git a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
index ed2bb11f77b41b..53ba3829b15030 100644
--- a/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
+++ b/examples/research_projects/bertabs/convert_bertabs_original_pytorch_checkpoint.py
@@ -24,9 +24,9 @@
 from collections import namedtuple
 import torch
-
 from model_bertabs import BertAbsSummarizer
 from models.model_builder import AbsSummarizer  # The authors' implementation
+
 from transformers import BertTokenizer
diff --git a/examples/research_projects/bertabs/modeling_bertabs.py b/examples/research_projects/bertabs/modeling_bertabs.py
index a7d8611a265f0d..33e216f4a08117 100644
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@@ -24,10 +24,10 @@
 import numpy as np
 import torch
+from configuration_bertabs import BertAbsConfig
 from torch import nn
 from torch.nn.init import xavier_uniform_
-from configuration_bertabs import BertAbsConfig
 from transformers import BertConfig, BertModel, PreTrainedModel
diff --git a/examples/research_projects/bertabs/run_summarization.py b/examples/research_projects/bertabs/run_summarization.py
index fcfae6b8c6c755..82ef8ab39ea9b7 100644
--- a/examples/research_projects/bertabs/run_summarization.py
+++ b/examples/research_projects/bertabs/run_summarization.py
@@ -6,10 +6,10 @@
 from collections import namedtuple
 import torch
+from modeling_bertabs import BertAbs, build_predictor
 from torch.utils.data import DataLoader, SequentialSampler
 from tqdm import tqdm
-from modeling_bertabs import BertAbs, build_predictor
 from transformers import BertTokenizer
 from .utils_summarization import (
@@ -45,7 +45,6 @@ def evaluate(args):
     generated_summaries = []
     import nltk
-
     import rouge
     nltk.download("punkt")
diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
index 8fc30b912468ba..927a15f9be679f 100644
--- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
+++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
@@ -3,8 +3,8 @@
 import numpy as np
 from datasets import ClassLabel, DatasetDict, load_dataset
-
 from evaluate import load
+
 from transformers import (
     AutoModelForSequenceClassification,
     AutoTokenizer,
diff --git a/examples/research_projects/codeparrot/scripts/bpe_training.py b/examples/research_projects/codeparrot/scripts/bpe_training.py
index 8a3d6ee9eec19d..1cbeb4b4ee3240 100644
--- a/examples/research_projects/codeparrot/scripts/bpe_training.py
+++ b/examples/research_projects/codeparrot/scripts/bpe_training.py
@@ -1,7 +1,7 @@
+from arguments import TokenizerTrainingArguments
 from datasets import load_dataset
 from tqdm import tqdm
-from arguments import TokenizerTrainingArguments
 from transformers import AutoTokenizer, HfArgumentParser
 from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
diff --git a/examples/research_projects/codeparrot/scripts/codeparrot_training.py b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
index b2af8767a217a6..2510e02c94700d 100644
--- a/examples/research_projects/codeparrot/scripts/codeparrot_training.py
+++ b/examples/research_projects/codeparrot/scripts/codeparrot_training.py
@@ -6,16 +6,16 @@
 import datasets
 import torch
+from accelerate import Accelerator, DistributedType
+from arguments import TrainingArguments
 from datasets import load_dataset
+from huggingface_hub import Repository
 from torch.optim import AdamW
 from torch.utils.data import IterableDataset
 from torch.utils.data.dataloader import DataLoader
 from torch.utils.data.datapipes.iter.combinatorics import ShufflerIterDataPipe
 import transformers
-from accelerate import Accelerator, DistributedType
-from arguments import TrainingArguments
-from huggingface_hub import Repository
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, get_scheduler, set_seed
diff --git a/examples/research_projects/codeparrot/scripts/human_eval.py b/examples/research_projects/codeparrot/scripts/human_eval.py
index d0614134ad4732..157079881d5f73 100644
--- a/examples/research_projects/codeparrot/scripts/human_eval.py
+++ b/examples/research_projects/codeparrot/scripts/human_eval.py
@@ -5,15 +5,15 @@
 from collections import defaultdict
 import torch
+from accelerate import Accelerator
+from accelerate.utils import set_seed
+from arguments import HumanEvalArguments
 from datasets import load_dataset, load_metric
 from torch.utils.data import IterableDataset
 from torch.utils.data.dataloader import DataLoader
 from tqdm import tqdm
 import transformers
-from accelerate import Accelerator
-from accelerate.utils import set_seed
-from arguments import HumanEvalArguments
 from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, StoppingCriteria, StoppingCriteriaList
diff --git a/examples/research_projects/codeparrot/scripts/initialize_model.py b/examples/research_projects/codeparrot/scripts/initialize_model.py
index 9d066b19087396..6bf028688f1262 100644
--- a/examples/research_projects/codeparrot/scripts/initialize_model.py
+++ b/examples/research_projects/codeparrot/scripts/initialize_model.py
@@ -1,4 +1,5 @@
 from arguments import InitializationArguments
+
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
diff --git a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
index 9e1ef11ff07d15..195a9dc8096b21 100644
--- a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
+++ b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
@@ -6,10 +6,9 @@
 from typing import Dict, List, Optional, Set, Tuple, Type
 from datasets import Dataset
-from tqdm import tqdm
-
 from datasketch import MinHash, MinHashLSH
 from dpu_utils.utils.iterators import ThreadedIterator
+from tqdm import tqdm
 NON_ALPHA = re.compile("[^A-Za-z_0-9]")
diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
index 6236a8aad86aa1..07540d0b628433 100644
--- a/examples/research_projects/codeparrot/scripts/preprocessing.py
+++ b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -9,10 +9,10 @@
 from pathlib import Path
 import numpy as np
-from datasets import load_dataset
-
 from arguments import PreprocessingArguments
+from datasets import load_dataset
 from minhash_deduplication import deduplicate_dataset
+
 from transformers import AutoTokenizer, HfArgumentParser
diff --git a/examples/research_projects/codeparrot/scripts/pretokenizing.py b/examples/research_projects/codeparrot/scripts/pretokenizing.py
index 9ebe1e577ddefa..5eb793d10d959c 100644
--- a/examples/research_projects/codeparrot/scripts/pretokenizing.py
+++ b/examples/research_projects/codeparrot/scripts/pretokenizing.py
@@ -1,9 +1,9 @@
 import multiprocessing
 import time
+from arguments import PretokenizationArguments
 from datasets import load_dataset
-from arguments import PretokenizationArguments from transformers import AutoTokenizer, HfArgumentParser diff --git a/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py b/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py index e4438271355707..aaf53de137f490 100644 --- a/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py +++ b/examples/research_projects/codeparrot/scripts/tests/test_deduplicate.py @@ -1,7 +1,6 @@ from unittest import TestCase from datasets import Dataset - from minhash_deduplication import deduplicate_dataset, make_duplicate_clusters diff --git a/examples/research_projects/codeparrot/scripts/validation_loss.py b/examples/research_projects/codeparrot/scripts/validation_loss.py index 280a79dbed0824..929c2df427e227 100644 --- a/examples/research_projects/codeparrot/scripts/validation_loss.py +++ b/examples/research_projects/codeparrot/scripts/validation_loss.py @@ -1,12 +1,12 @@ import logging import torch +from accelerate import Accelerator +from arguments import EvaluationArguments from datasets import load_dataset from torch.utils.data import IterableDataset from torch.utils.data.dataloader import DataLoader -from accelerate import Accelerator -from arguments import EvaluationArguments from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed diff --git a/examples/research_projects/decision_transformer/run_decision_transformer.py b/examples/research_projects/decision_transformer/run_decision_transformer.py index a1e4785d29fc2e..d6c3e28331259d 100644 --- a/examples/research_projects/decision_transformer/run_decision_transformer.py +++ b/examples/research_projects/decision_transformer/run_decision_transformer.py @@ -1,8 +1,8 @@ +import gym import numpy as np import torch - -import gym from mujoco_py import GlfwContext + from transformers import DecisionTransformerModel diff --git a/examples/research_projects/deebert/src/modeling_highway_bert.py b/examples/research_projects/deebert/src/modeling_highway_bert.py index 37d81248ed4550..2a881decbbd529 100644 --- a/examples/research_projects/deebert/src/modeling_highway_bert.py +++ b/examples/research_projects/deebert/src/modeling_highway_bert.py @@ -229,7 +229,10 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions), highway exits diff --git a/examples/research_projects/deebert/src/modeling_highway_roberta.py b/examples/research_projects/deebert/src/modeling_highway_roberta.py index c8358ac99454fd..c21fb32fde762a 100644 --- a/examples/research_projects/deebert/src/modeling_highway_roberta.py +++ b/examples/research_projects/deebert/src/modeling_highway_roberta.py @@ -19,7 +19,6 @@ ROBERTA_START_DOCSTRING, ) class DeeRobertaModel(DeeBertModel): - config_class = RobertaConfig base_model_prefix = "roberta" @@ -36,7 +35,6 @@ def __init__(self, config): ROBERTA_START_DOCSTRING, ) class DeeRobertaForSequenceClassification(BertPreTrainedModel): - config_class = RobertaConfig base_model_prefix = "roberta" diff --git a/examples/research_projects/deebert/test_glue_deebert.py b/examples/research_projects/deebert/test_glue_deebert.py index 7a709308e6f716..775c4d70b6523e 100644 --- a/examples/research_projects/deebert/test_glue_deebert.py 
+++ b/examples/research_projects/deebert/test_glue_deebert.py @@ -4,6 +4,7 @@ from unittest.mock import patch import run_glue_deebert + from transformers.testing_utils import TestCasePlus, get_gpu_count, require_torch_non_multi_gpu, slow @@ -45,7 +46,6 @@ def run_and_check(self, args): @slow @require_torch_non_multi_gpu def test_glue_deebert_train(self): - train_args = """ --model_type roberta --model_name_or_path roberta-base diff --git a/examples/research_projects/distillation/distiller.py b/examples/research_projects/distillation/distiller.py index fc5dc58941f7ba..3ef2ba87b2e211 100644 --- a/examples/research_projects/distillation/distiller.py +++ b/examples/research_projects/distillation/distiller.py @@ -21,14 +21,14 @@ import psutil import torch +from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups +from lm_seqs_dataset import LmSeqsDataset from torch import nn from torch.optim import AdamW from torch.utils.data import BatchSampler, DataLoader, RandomSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups -from lm_seqs_dataset import LmSeqsDataset from transformers import get_linear_schedule_with_warmup from utils import logger diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py index 3acfd468640626..aba91995da0c3f 100644 --- a/examples/research_projects/distillation/run_squad_w_distillation.py +++ b/examples/research_projects/distillation/run_squad_w_distillation.py @@ -189,7 +189,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/distillation/train.py b/examples/research_projects/distillation/train.py index cc2362888e4725..bb35a1df853943 100644 --- a/examples/research_projects/distillation/train.py +++ b/examples/research_projects/distillation/train.py @@ -24,9 +24,9 @@ import numpy as np import torch - from distiller import Distiller from lm_seqs_dataset import LmSeqsDataset + from transformers import ( BertConfig, BertForMaskedLM, diff --git a/examples/research_projects/information-gain-filtration/igf/igf.py b/examples/research_projects/information-gain-filtration/igf/igf.py index 99bd8c2d06d71c..6861467a33592a 100644 --- a/examples/research_projects/information-gain-filtration/igf/igf.py +++ b/examples/research_projects/information-gain-filtration/igf/igf.py @@ -5,13 +5,13 @@ import logging import random +import joblib import numpy as np import torch import torch.nn as nn from torch.utils.data import DataLoader from tqdm import tqdm -import joblib from transformers import AdamW, GPT2LMHeadModel, get_linear_schedule_with_warmup @@ -119,7 +119,6 @@ def recopy_gpt2(orig_model, device, max_steps): def intermittent_save(contexts, real_perps, past_perps, filename): - """ save the perplexity differences to filename @@ -152,7 +151,6 @@ def collect_objective_set( filename="dev.jbl", recopy_model=recopy_gpt2, ): - """ Collect individual IGF values from pre-trained transformer model max_steps samples of training data to train secondary model @@ -271,7 +269,6 @@ def generate_datasets( def 
train_secondary_learner( secondary_learner, train_dataset, max_epochs, batch_size, eval_freq=50, igf_model_path="secondary_learner.pt" ): - """ Train the secondary learner (igf_model) diff --git a/examples/research_projects/information-gain-filtration/run_clm_igf.py b/examples/research_projects/information-gain-filtration/run_clm_igf.py index eae10060b22fd1..c1584a2f89adc1 100644 --- a/examples/research_projects/information-gain-filtration/run_clm_igf.py +++ b/examples/research_projects/information-gain-filtration/run_clm_igf.py @@ -28,11 +28,9 @@ import argparse import random +import joblib import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler - -import joblib from igf.igf import ( SecondaryLearner, collect_objective_set, @@ -43,6 +41,8 @@ set_seed, train_secondary_learner, ) +from torch.utils.data import DataLoader, RandomSampler + from transformers import GPT2LMHeadModel @@ -55,7 +55,6 @@ def generate_n_pairs( data_file="data/tokenized_stories_train_wikitext103.jbl", igf_data_file="igf_context_pairs.jbl", ): - """ Collecting *n* pairs for training the secondary learner Args: diff --git a/examples/research_projects/jax-projects/big_bird/bigbird_flax.py b/examples/research_projects/jax-projects/big_bird/bigbird_flax.py index b9ff9da28140a4..ac37cbc8600a2f 100644 --- a/examples/research_projects/jax-projects/big_bird/bigbird_flax.py +++ b/examples/research_projects/jax-projects/big_bird/bigbird_flax.py @@ -4,8 +4,6 @@ from functools import partial from typing import Callable -from tqdm.auto import tqdm - import flax.linen as nn import jax import jax.numpy as jnp @@ -16,6 +14,8 @@ from flax.serialization import from_bytes, to_bytes from flax.training import train_state from flax.training.common_utils import shard +from tqdm.auto import tqdm + from transformers import BigBirdConfig, FlaxBigBirdForQuestionAnswering from transformers.models.big_bird.modeling_flax_big_bird import FlaxBigBirdForQuestionAnsweringModule @@ -98,7 +98,6 @@ def __post_init__(self): @dataclass class DataCollator: - pad_id: int max_length: int = 4096 # no dynamic padding on TPUs diff --git a/examples/research_projects/jax-projects/big_bird/evaluate.py b/examples/research_projects/jax-projects/big_bird/evaluate.py index e3309f494e34b2..32ca5172a5f25c 100644 --- a/examples/research_projects/jax-projects/big_bird/evaluate.py +++ b/examples/research_projects/jax-projects/big_bird/evaluate.py @@ -1,8 +1,8 @@ -from datasets import load_from_disk - import jax import jax.numpy as jnp from bigbird_flax import FlaxBigBirdForNaturalQuestions +from datasets import load_from_disk + from transformers import BigBirdTokenizerFast diff --git a/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py b/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py index 8d2f69031e2ab4..22dc3e455024c0 100644 --- a/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py +++ b/examples/research_projects/jax-projects/big_bird/prepare_natural_questions.py @@ -1,10 +1,9 @@ import os +import jsonlines import numpy as np from tqdm import tqdm -import jsonlines - DOC_STRIDE = 2048 MAX_LENGTH = 4096 diff --git a/examples/research_projects/jax-projects/big_bird/train.py b/examples/research_projects/jax-projects/big_bird/train.py index 3d67c9d97f6758..ce37b7f975bb3a 100644 --- a/examples/research_projects/jax-projects/big_bird/train.py +++ b/examples/research_projects/jax-projects/big_bird/train.py @@ -1,12 +1,12 @@ import os from dataclasses import replace 
-from datasets import load_dataset - import jax import wandb from bigbird_flax import Args, DataCollator, FlaxBigBirdForNaturalQuestions, Trainer, build_tx, train_step, val_step +from datasets import load_dataset from flax import jax_utils + from transformers import BigBirdTokenizerFast diff --git a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py index e4bec5e2886691..3c5bdb7b44507c 100755 --- a/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py +++ b/examples/research_projects/jax-projects/dataset-streaming/run_mlm_flax_stream.py @@ -32,17 +32,17 @@ from typing import Dict, List, Optional, Tuple import datasets -import numpy as np -from datasets import load_dataset -from tqdm import tqdm - import flax import jax import jax.numpy as jnp +import numpy as np import optax +from datasets import load_dataset from flax import jax_utils, traverse_util from flax.training import train_state from flax.training.common_utils import get_metrics, onehot, shard +from tqdm import tqdm + from transformers import ( CONFIG_MAPPING, FLAX_MODEL_FOR_MASKED_LM_MAPPING, diff --git a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py index a5a395272fdc22..e60f07bdd06325 100644 --- a/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py +++ b/examples/research_projects/jax-projects/hybrid_clip/modeling_hybrid_clip.py @@ -20,6 +20,7 @@ import jax.numpy as jnp from configuration_hybrid_clip import HybridCLIPConfig from flax.core.frozen_dict import FrozenDict + from transformers import FLAX_MODEL_MAPPING, FlaxCLIPVisionModel from transformers.modeling_flax_utils import FlaxPreTrainedModel from transformers.models.clip.modeling_flax_clip import FlaxCLIPOutput @@ -132,7 +133,7 @@ def __init__( input_shape: Optional[Tuple] = None, seed: int = 0, dtype: jnp.dtype = jnp.float32, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3)) diff --git a/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py b/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py index 1be46f6af99368..f54641408f80a2 100644 --- a/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py +++ b/examples/research_projects/jax-projects/hybrid_clip/run_hybrid_clip.py @@ -32,22 +32,22 @@ from pathlib import Path from typing import Callable, Optional -import torch -from torchvision.datasets import VisionDataset -from torchvision.io import ImageReadMode, read_image -from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize -from torchvision.transforms.functional import InterpolationMode -from tqdm import tqdm - import jax import jax.numpy as jnp import optax -import transformers +import torch from flax import jax_utils from flax.jax_utils import unreplicate from flax.training import train_state from flax.training.common_utils import get_metrics, shard, shard_prng_key from modeling_hybrid_clip import FlaxHybridCLIP +from torchvision.datasets import VisionDataset +from torchvision.io import ImageReadMode, read_image +from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize +from torchvision.transforms.functional import InterpolationMode +from tqdm import tqdm + +import transformers from transformers import 
AutoTokenizer, HfArgumentParser, TrainingArguments, is_tensorboard_available, set_seed diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py index 16eb1007b4c73b..7103b5a28111ff 100644 --- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py +++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py @@ -28,19 +28,19 @@ from typing import Callable, Optional import datasets -import numpy as np -from datasets import Dataset, load_dataset -from tqdm import tqdm - import jax import jax.numpy as jnp +import numpy as np import optax -import transformers +from datasets import Dataset, load_dataset from flax.core.frozen_dict import freeze, unfreeze from flax.training.common_utils import onehot, stack_forest from jax.experimental.maps import mesh from jax.experimental.pjit import pjit from partitions import set_partitions +from tqdm import tqdm + +import transformers from transformers import ( CONFIG_MAPPING, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, diff --git a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py index 71bf60d2c6027d..5034e1ee9137a2 100755 --- a/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py +++ b/examples/research_projects/jax-projects/wav2vec2/run_wav2vec2_pretrain_flax.py @@ -6,18 +6,18 @@ from pathlib import Path from typing import Dict, List, Optional, Union -import numpy as np -from datasets import DatasetDict, load_dataset -from tqdm import tqdm - import flax import jax import jax.numpy as jnp import librosa +import numpy as np import optax +from datasets import DatasetDict, load_dataset from flax import jax_utils, traverse_util from flax.training import train_state from flax.training.common_utils import get_metrics, onehot, shard +from tqdm import tqdm + from transformers import ( FlaxWav2Vec2ForPreTraining, HfArgumentParser, diff --git a/examples/research_projects/longform-qa/eli5_app.py b/examples/research_projects/longform-qa/eli5_app.py index 7782d6433ba7c5..1bcb6fd20d25fc 100644 --- a/examples/research_projects/longform-qa/eli5_app.py +++ b/examples/research_projects/longform-qa/eli5_app.py @@ -1,11 +1,9 @@ import datasets +import faiss import numpy as np import streamlit as st import torch from elasticsearch import Elasticsearch - -import faiss -import transformers from eli5_utils import ( embed_questions_for_retrieval, make_qa_s2s_model, @@ -13,6 +11,8 @@ query_es_index, query_qa_dense_index, ) + +import transformers from transformers import AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer diff --git a/examples/research_projects/longform-qa/eli5_utils.py b/examples/research_projects/longform-qa/eli5_utils.py index 82c4bd8caf20d3..db4eae66041be4 100644 --- a/examples/research_projects/longform-qa/eli5_utils.py +++ b/examples/research_projects/longform-qa/eli5_utils.py @@ -5,6 +5,7 @@ from time import time import datasets # noqa: F401 +import faiss # noqa: F401 import numpy as np import pandas as pd import torch @@ -15,7 +16,6 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from tqdm import tqdm -import faiss # noqa: F401 from transformers import AdamW, AutoModel, AutoModelForSeq2SeqLM, AutoTokenizer, get_linear_schedule_with_warmup diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py index 
cb81402425ff2d..4c5227d2c7e011 100644 --- a/examples/research_projects/luke/run_luke_ner_no_trainer.py +++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py @@ -27,14 +27,14 @@ import datasets import torch +from accelerate import Accelerator, DistributedDataParallelKwargs from datasets import ClassLabel, load_dataset, load_metric +from huggingface_hub import Repository +from luke_utils import DataCollatorForLukeTokenClassification, is_punctuation, padding_tensor from torch.utils.data import DataLoader from tqdm.auto import tqdm import transformers -from accelerate import Accelerator, DistributedDataParallelKwargs -from huggingface_hub import Repository -from luke_utils import DataCollatorForLukeTokenClassification, is_punctuation, padding_tensor from transformers import ( AdamW, LukeConfig, diff --git a/examples/research_projects/lxmert/extracting_data.py b/examples/research_projects/lxmert/extracting_data.py index 9790e20ad86bf9..9c445be336f553 100644 --- a/examples/research_projects/lxmert/extracting_data.py +++ b/examples/research_projects/lxmert/extracting_data.py @@ -9,9 +9,9 @@ import datasets import numpy as np import torch - from modeling_frcnn import GeneralizedRCNN from processing_image import Preprocess + from utils import Config diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py index 33c1133e9589f4..08758b1d3cac06 100644 --- a/examples/research_projects/lxmert/modeling_frcnn.py +++ b/examples/research_projects/lxmert/modeling_frcnn.py @@ -169,7 +169,6 @@ def get_norm(norm, out_channels): def _create_grid_offsets(size: List[int], stride: int, offset: float, device): - grid_height, grid_width = size shifts_x = torch.arange( offset * stride, @@ -390,7 +389,6 @@ def assign_boxes_to_levels( canonical_box_size: int, canonical_level: int, ): - box_sizes = torch.sqrt(torch.cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)) @@ -1708,9 +1706,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, ( - "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint" - .format(pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: diff --git a/examples/research_projects/lxmert/utils.py b/examples/research_projects/lxmert/utils.py index 8e830fb8359d29..2fc6ea2062efd2 100644 --- a/examples/research_projects/lxmert/utils.py +++ b/examples/research_projects/lxmert/utils.py @@ -34,14 +34,13 @@ from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile -import numpy as np -from PIL import Image -from tqdm.auto import tqdm - import cv2 +import numpy as np import requests import wget from filelock import FileLock +from PIL import Image +from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -181,7 +180,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): - cache_dir = 
kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -225,14 +223,13 @@ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): # quick compare tensors def compare(in_tensor): - out_tensor = torch.load("dump.pt", map_location=in_tensor.device) n1 = in_tensor.numpy() n2 = out_tensor.numpy()[0] print(n1.shape, n1[0, 0, :5]) print(n2.shape, n2[0, 0, :5]) assert np.allclose(n1, n2, rtol=0.01, atol=0.1), ( - f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} %" + f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x is False])/len(n1.flatten())*100:.4f} %" " element-wise mismatch" ) raise Exception("tensors are all good") @@ -300,7 +297,6 @@ def get_from_cache( user_agent=None, local_files_only=False, ): - if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): @@ -355,7 +351,6 @@ def get_from_cache( # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): - # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. @@ -406,7 +401,6 @@ def _resumable_file_manager(): def url_to_filename(url, etag=None): - url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() diff --git a/examples/research_projects/lxmert/visualizing_image.py b/examples/research_projects/lxmert/visualizing_image.py index a02dc66dfb7c61..163d661e873ec3 100644 --- a/examples/research_projects/lxmert/visualizing_image.py +++ b/examples/research_projects/lxmert/visualizing_image.py @@ -18,6 +18,7 @@ import colorsys import io +import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure @@ -25,7 +26,6 @@ import torch from matplotlib.backends.backend_agg import FigureCanvasAgg -import cv2 from utils import img_tensorize diff --git a/examples/research_projects/mlm_wwm/run_chinese_ref.py b/examples/research_projects/mlm_wwm/run_chinese_ref.py index 4d1c9e81e94ac3..eca89df97982da 100644 --- a/examples/research_projects/mlm_wwm/run_chinese_ref.py +++ b/examples/research_projects/mlm_wwm/run_chinese_ref.py @@ -3,6 +3,7 @@ from typing import List from ltp import LTP + from transformers.models.bert.tokenization_bert import BertTokenizer diff --git a/examples/research_projects/mm-imdb/run_mmimdb.py b/examples/research_projects/mm-imdb/run_mmimdb.py index 9f12257a10a8cb..23b2a65e5c96ac 100644 --- a/examples/research_projects/mm-imdb/run_mmimdb.py +++ b/examples/research_projects/mm-imdb/run_mmimdb.py @@ -30,6 +30,7 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels import transformers from transformers import ( @@ -43,7 +44,6 @@ get_linear_schedule_with_warmup, ) from transformers.trainer_utils import is_main_process -from utils_mmimdb import ImageEncoder, JsonlDataset, collate_fn, get_image_transforms, get_mmimdb_labels try: diff --git a/examples/research_projects/movement-pruning/bertarize.py b/examples/research_projects/movement-pruning/bertarize.py index 623b46b94386fd..0c9cc63571d7c1 100644 --- 
a/examples/research_projects/movement-pruning/bertarize.py +++ b/examples/research_projects/movement-pruning/bertarize.py @@ -22,7 +22,6 @@ import shutil import torch - from emmental.modules import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer diff --git a/examples/research_projects/movement-pruning/counts_parameters.py b/examples/research_projects/movement-pruning/counts_parameters.py index 0aec3766b3f95c..17ddb029f89780 100644 --- a/examples/research_projects/movement-pruning/counts_parameters.py +++ b/examples/research_projects/movement-pruning/counts_parameters.py @@ -19,7 +19,6 @@ import os import torch - from emmental.modules import ThresholdBinarizer, TopKBinarizer diff --git a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py index 66d78b0c8fdc19..2a3bd763a2de36 100644 --- a/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/configuration_bert_masked.py @@ -50,7 +50,7 @@ def __init__( pruning_method="topK", mask_init="constant", mask_scale=0.0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py index 4228050fe123b3..d404bf49aaa62d 100644 --- a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py @@ -649,7 +649,10 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - outputs = (sequence_output, pooled_output,) + encoder_outputs[ + outputs = ( + sequence_output, + pooled_output, + ) + encoder_outputs[ 1: ] # add hidden_states and attentions if they are here return outputs # sequence_output, pooled_output, (hidden_states), (attentions) diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py index e81cf9209c889d..4ce56e524f714b 100644 --- a/examples/research_projects/movement-pruning/masked_run_glue.py +++ b/examples/research_projects/movement-pruning/masked_run_glue.py @@ -24,12 +24,12 @@ import numpy as np import torch +from emmental import MaskedBertConfig, MaskedBertForSequenceClassification from torch import nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from emmental import MaskedBertConfig, MaskedBertForSequenceClassification from transformers import ( WEIGHTS_NAME, AdamW, @@ -228,7 +228,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py index 1bd501eda51440..a516bb8d585ddd 100644 --- a/examples/research_projects/movement-pruning/masked_run_squad.py +++ b/examples/research_projects/movement-pruning/masked_run_squad.py @@ -25,12 +25,12 @@ import numpy 
as np import torch +from emmental import MaskedBertConfig, MaskedBertForQuestionAnswering from torch import nn from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange -from emmental import MaskedBertConfig, MaskedBertForQuestionAnswering from transformers import ( WEIGHTS_NAME, AdamW, @@ -236,7 +236,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None): for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 diff --git a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py index 8f4760580fd9ba..5c1b0da700024b 100644 --- a/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py +++ b/examples/research_projects/onnx/summarization/bart_onnx/generation_onnx.py @@ -264,7 +264,6 @@ def greedy_search( past: List[torch.Tensor] = [] while cur_len < max_length: - logits, past = self._decoder_forward(input_ids, encoder_output, attention_mask, past) next_token_logits = logits[:, -1, :] @@ -303,7 +302,6 @@ def _prepare_decoder_input_ids_for_generation( decoder_start_token_id, bos_token_id: Optional[int] = None, ) -> torch.LongTensor: - decoder_input_ids = ( torch.ones((input_ids.shape[0], 1), dtype=input_ids.dtype, device=input_ids.device) * decoder_start_token_id @@ -633,7 +631,6 @@ def _reorder_cache(self, past: List[torch.Tensor], beam_idx): def beam_search( self, input_ids, encoder_output, attention_mask, num_beams, max_length, pad_token_id: int, eos_token_id: int ): - batch_size = self.beam_scorer.batch_size num_beams = self.beam_scorer.num_beams diff --git a/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py b/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py index 63fae44ffac6bc..d327cdb2841d9c 100644 --- a/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py +++ b/examples/research_projects/onnx/summarization/bart_onnx/reduce_onnx_size.py @@ -5,7 +5,6 @@ import os import numpy - import onnx diff --git a/examples/research_projects/onnx/summarization/run_onnx_exporter.py b/examples/research_projects/onnx/summarization/run_onnx_exporter.py index 5d751ace8eee10..889eefb4e74b56 100644 --- a/examples/research_projects/onnx/summarization/run_onnx_exporter.py +++ b/examples/research_projects/onnx/summarization/run_onnx_exporter.py @@ -22,12 +22,12 @@ import sys import numpy as np -import torch - import onnxruntime -import transformers +import torch from bart_onnx.generation_onnx import BARTBeamSearchGenerator from bart_onnx.reduce_onnx_size import remove_dup_initializers + +import transformers from transformers import BartForConditionalGeneration, BartTokenizer diff --git a/examples/research_projects/performer/modeling_flax_performer.py b/examples/research_projects/performer/modeling_flax_performer.py index b4b9924fae2716..7c2fde6ddbb5dc 100644 --- a/examples/research_projects/performer/modeling_flax_performer.py +++ b/examples/research_projects/performer/modeling_flax_performer.py @@ -15,13 +15,13 @@ from typing import Callable, Dict, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from 
jax.random import PRNGKey from modeling_flax_performer_utils import make_fast_softmax_attention + from transformers.file_utils import add_start_docstrings from transformers.modeling_flax_utils import ACT2FN from transformers.models.bert.configuration_bert import BertConfig @@ -366,7 +366,6 @@ def convert_from_pytorch(pt_state: Dict, config: BertConfig) -> Dict: # SelfAttention needs also to replace "weight" by "kernel" if {"query", "key", "value"} & key_parts: - # Flax SelfAttention decomposes the heads (num_head, size // num_heads) if "bias" in key: jax_state[key] = tensor.reshape((config.num_attention_heads, -1)) @@ -443,7 +442,6 @@ def module(self) -> nn.Module: def __call__( self, input_ids, token_type_ids=None, position_ids=None, dropout_rng: PRNGKey = None, attention_mask=None ): - input_ids, attention_mask, token_type_ids, position_ids = self._check_inputs( input_ids, attention_mask, token_type_ids, position_ids ) diff --git a/examples/research_projects/performer/modeling_flax_performer_utils.py b/examples/research_projects/performer/modeling_flax_performer_utils.py index 915e2fa23dd98f..6e6173729cc348 100644 --- a/examples/research_projects/performer/modeling_flax_performer_utils.py +++ b/examples/research_projects/performer/modeling_flax_performer_utils.py @@ -30,11 +30,10 @@ import functools from collections.abc import Iterable # pylint: disable=g-importing-member -import numpy as onp -from absl import logging - import jax import jax.numpy as jnp +import numpy as onp +from absl import logging from jax import lax, random @@ -524,7 +523,6 @@ def dot_product_attention( deterministic=False, precision=None, ): - assert key.shape[:-1] == value.shape[:-1] assert query.shape[0:1] == key.shape[0:1] and query.shape[-1] == key.shape[-1] if axis is None: diff --git a/examples/research_projects/performer/run_mlm_performer.py b/examples/research_projects/performer/run_mlm_performer.py index 35de233f727ea4..1547ead421fd6f 100644 --- a/examples/research_projects/performer/run_mlm_performer.py +++ b/examples/research_projects/performer/run_mlm_performer.py @@ -28,18 +28,18 @@ from pathlib import Path from typing import Dict, List, Optional, Tuple -import numpy as np -from datasets import load_dataset -from tqdm import tqdm - import jax import jax.numpy as jnp +import numpy as np +from datasets import load_dataset from flax import jax_utils from flax.optim import Adam from flax.training import common_utils from flax.training.common_utils import get_metrics from jax.nn import log_softmax from modeling_flax_performer import FlaxPerformerForMaskedLM +from tqdm import tqdm + from transformers import ( MODEL_FOR_MASKED_LM_MAPPING, AutoTokenizer, @@ -632,7 +632,6 @@ def tokenize_function(examples): epochs = tqdm(range(nb_epochs), desc=f"Epoch ... 
(1/{nb_epochs})", position=0) for epoch in epochs: - # ======================== Training ================================ # Create sampling rng rng, training_rng, eval_rng = jax.random.split(rng, 3) diff --git a/examples/research_projects/pplm/run_pplm.py b/examples/research_projects/pplm/run_pplm.py index fdbad607201b17..54784b944c71cf 100644 --- a/examples/research_projects/pplm/run_pplm.py +++ b/examples/research_projects/pplm/run_pplm.py @@ -30,10 +30,10 @@ import numpy as np import torch +from pplm_classification_head import ClassificationHead from torch import nn from tqdm import trange -from pplm_classification_head import ClassificationHead from transformers import GPT2LMHeadModel, GPT2Tokenizer from transformers.file_utils import cached_path @@ -345,7 +345,7 @@ def full_text_generation( gm_scale=0.9, kl_scale=0.01, repetition_penalty=1.0, - **kwargs + **kwargs, ): classifier, class_id = get_classifier(discrim, class_label, device) @@ -463,7 +463,6 @@ def generate_text_pplm( unpert_discrim_loss = 0 loss_in_time = [] for i in trange(length, ascii=True): - # Get past/probs for current output, except for last word # Note that GPT takes 2 inputs: past + current_token @@ -547,7 +546,6 @@ def generate_text_pplm( # Fuse the modified model and original model if perturb: - unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1) pert_probs = (pert_probs**gm_scale) * (unpert_probs ** (1 - gm_scale)) # + SMALL_CONST diff --git a/examples/research_projects/pplm/run_pplm_discrim_train.py b/examples/research_projects/pplm/run_pplm_discrim_train.py index 6a7351d9e6a63a..d53b557d1af031 100644 --- a/examples/research_projects/pplm/run_pplm_discrim_train.py +++ b/examples/research_projects/pplm/run_pplm_discrim_train.py @@ -26,12 +26,12 @@ import torch.optim as optim import torch.utils.data as data from nltk.tokenize.treebank import TreebankWordDetokenizer +from pplm_classification_head import ClassificationHead from torch import nn from torchtext import data as torchtext_data from torchtext import datasets from tqdm import tqdm, trange -from pplm_classification_head import ClassificationHead from transformers import GPT2LMHeadModel, GPT2Tokenizer diff --git a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py index bd0b1157b01d47..814f95d0ab8f79 100755 --- a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py +++ b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py @@ -21,19 +21,19 @@ import datasets import numpy as np +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda +import tensorrt as trt import torch from absl import logging as absl_logging +from accelerate import Accelerator from datasets import load_dataset, load_metric from torch.utils.data import DataLoader +from utils_qa import postprocess_qa_predictions -import pycuda.autoinit # noqa: F401 -import pycuda.driver as cuda -import tensorrt as trt import transformers -from accelerate import Accelerator from transformers import AutoTokenizer, EvalPrediction, default_data_collator, set_seed from transformers.trainer_pt_utils import nested_concat, nested_truncate -from utils_qa import postprocess_qa_predictions TRT_LOGGER = trt.Logger(trt.Logger.WARNING) @@ -395,7 +395,6 @@ def post_processing_function(examples, features, predictions, stage="eval"): with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine( f.read() ) as engine, 
engine.create_execution_context() as context: - # setup for TRT inferrence for i in range(len(input_names)): context.set_binding_shape(i, INPUT_SHAPE) @@ -427,7 +426,6 @@ def binding_nbytes(binding): all_preds = None for step, batch in enumerate(eval_dataloader): - outputs, infer_time = model_infer(batch, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream) total_time += infer_time niter += 1 diff --git a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py index 4ed4203062c01c..bb0436c125800b 100644 --- a/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py +++ b/examples/research_projects/quantization-qdqbert/ort-infer-benchmark.py @@ -2,7 +2,6 @@ import time import numpy as np - import onnxruntime as ort diff --git a/examples/research_projects/quantization-qdqbert/quant_trainer.py b/examples/research_projects/quantization-qdqbert/quant_trainer.py index ce1ecb6c51feac..9360cc01ba7fa0 100755 --- a/examples/research_projects/quantization-qdqbert/quant_trainer.py +++ b/examples/research_projects/quantization-qdqbert/quant_trainer.py @@ -16,10 +16,9 @@ import logging import re -import torch - import pytorch_quantization import pytorch_quantization.nn as quant_nn +import torch from pytorch_quantization import calib from pytorch_quantization.tensor_quant import QuantDescriptor diff --git a/examples/research_projects/quantization-qdqbert/run_quant_qa.py b/examples/research_projects/quantization-qdqbert/run_quant_qa.py index 5008197b8b845d..ba5dfe4c090736 100755 --- a/examples/research_projects/quantization-qdqbert/run_quant_qa.py +++ b/examples/research_projects/quantization-qdqbert/run_quant_qa.py @@ -26,11 +26,12 @@ from typing import Optional import datasets +import quant_trainer from datasets import load_dataset, load_metric +from trainer_quant_qa import QuestionAnsweringTrainer +from utils_qa import postprocess_qa_predictions -import quant_trainer import transformers -from trainer_quant_qa import QuestionAnsweringTrainer from transformers import ( AutoTokenizer, DataCollatorWithPadding, @@ -46,7 +47,6 @@ from transformers.trainer_utils import SchedulerType, get_last_checkpoint from transformers.utils import check_min_version from transformers.utils.versions import require_version -from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py b/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py index ef0d93a7e357cc..9b8c53b272b11b 100644 --- a/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py +++ b/examples/research_projects/quantization-qdqbert/trainer_quant_qa.py @@ -20,10 +20,10 @@ import logging import os +import quant_trainer import torch from torch.utils.data import DataLoader -import quant_trainer from transformers import Trainer, is_torch_tpu_available from transformers.trainer_utils import PredictionOutput diff --git a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py index 5f18244a7aa481..09a30ff6d5c433 100644 --- a/examples/research_projects/rag-end2end-retriever/callbacks_rag.py +++ b/examples/research_projects/rag-end2end-retriever/callbacks_rag.py @@ -6,7 +6,6 @@ import torch from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.utilities import rank_zero_only - from utils_rag import save_json diff --git a/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py b/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py index 50842f062c997c..f97467292c25bf 100644 --- a/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py +++ b/examples/research_projects/rag-end2end-retriever/distributed_ray_retriever.py @@ -2,6 +2,7 @@ import random import ray + from transformers import RagConfig, RagRetriever, RagTokenizer from transformers.models.rag.retrieval_rag import CustomHFIndex @@ -166,7 +167,6 @@ def from_pretrained(cls, retriever_name_or_path, actor_handles, indexed_dataset= ) def re_load(self): - logger.info("re-loading the new dataset with embeddings") # access from the training loop diff --git a/examples/research_projects/rag-end2end-retriever/finetune_rag.py b/examples/research_projects/rag-end2end-retriever/finetune_rag.py index 53e1e657f7f1da..8d0ba293b12b90 100644 --- a/examples/research_projects/rag-end2end-retriever/finetune_rag.py +++ b/examples/research_projects/rag-end2end-retriever/finetune_rag.py @@ -252,14 +252,12 @@ def pad(self) -> int: raise NotImplementedError("pad not implemented") def training_step(self, batch, batch_idx) -> Dict: - global isEmUpdateBusy # use to check whether the entire embedding update process is finished or not global isAddIndexBusy # use to check whether the entire indexing process is finished or not global processes # use to keep threads embedding update processes global threadHandle_index # use to keep thread in embedding indexing processes if (self.trainer.global_rank == 0) and (self.custom_config.end2end): - if (not batch_idx == 0) and (batch_idx % self.custom_config.indexing_freq == 0): free_gpu_list = [] nvmlInit() @@ -282,7 +280,6 @@ def training_step(self, batch, batch_idx) -> Dict: has_free_gpus = False if (not isEmUpdateBusy) and has_free_gpus: - model_copy = type(self.model.rag.ctx_encoder)( self.config_dpr ) # get a new instance #this will be load in the CPU @@ -336,10 +333,8 @@ def training_step(self, batch, batch_idx) -> Dict: # check when index building has started if isAddIndexBusy: - # check still the index_building process is happening if not threadHandle_index.is_alive(): - logger.info("Merging the dataset shards") saved_dataset_shards = [] @@ -494,7 +489,6 @@ def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: 
self.tokenizer.save_pretrained(save_path) if self.custom_config.end2end: - modified_state_dict = self.model.state_dict() for key in self.model.state_dict().keys(): if key.split(".")[1] == "ctx_encoder": @@ -803,7 +797,6 @@ def main(args=None, model=None) -> GenerativeQAModule: if __name__ == "__main__": - multiprocessing.set_start_method("spawn") parser = argparse.ArgumentParser() parser = pl.Trainer.add_argparse_args(parser) diff --git a/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py b/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py index 25fa737e5aa3c5..444c07b2bab16a 100644 --- a/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py +++ b/examples/research_projects/rag-end2end-retriever/kb_encode_utils.py @@ -2,9 +2,9 @@ from functools import partial from glob import glob +import faiss from datasets import Features, Sequence, Value, concatenate_datasets, load_dataset, load_from_disk -import faiss from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast @@ -26,7 +26,6 @@ def split_documents(documents): def embed_update(ctx_encoder, total_processes, device, process_num, shard_dir, csv_path): - kb_dataset = load_dataset( "csv", data_files=[csv_path], split="train", delimiter="\t", column_names=["title", "text"] ) diff --git a/examples/research_projects/rag-end2end-retriever/lightning_base.py b/examples/research_projects/rag-end2end-retriever/lightning_base.py index 84842944059a7c..b9f8c6e3d7b5c0 100644 --- a/examples/research_projects/rag-end2end-retriever/lightning_base.py +++ b/examples/research_projects/rag-end2end-retriever/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -365,7 +365,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py index 432111a2784c37..e0aa86a3a65ba9 100644 --- a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py +++ b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py @@ -6,10 +6,10 @@ from tempfile import TemporaryDirectory from typing import List, Optional +import faiss import torch from datasets import Features, Sequence, Value, load_dataset -import faiss from transformers import DPRContextEncoder, DPRContextEncoderTokenizerFast, HfArgumentParser @@ -49,7 +49,6 @@ def main( processing_args: "ProcessingArguments", index_hnsw_args: "IndexHnswArguments", ): - ###################################### logger.info("Step 1 - Create the dataset") ###################################### diff --git a/examples/research_projects/rag/_test_finetune_rag.py b/examples/research_projects/rag/_test_finetune_rag.py index fa535f2268be34..0906295b301824 100644 --- a/examples/research_projects/rag/_test_finetune_rag.py +++ b/examples/research_projects/rag/_test_finetune_rag.py @@ -5,6 +5,7 @@ from pathlib import Path import finetune_rag + from transformers.file_utils import is_apex_available from transformers.testing_utils import ( TestCasePlus, diff --git a/examples/research_projects/rag/callbacks_rag.py b/examples/research_projects/rag/callbacks_rag.py index af1595b08efdf6..d75f97995bd16f 100644 --- 
a/examples/research_projects/rag/callbacks_rag.py +++ b/examples/research_projects/rag/callbacks_rag.py @@ -6,7 +6,6 @@ import torch from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint from pytorch_lightning.utilities import rank_zero_only - from utils_rag import save_json diff --git a/examples/research_projects/rag/consolidate_rag_checkpoint.py b/examples/research_projects/rag/consolidate_rag_checkpoint.py index 39ba7e91f6c3a6..6adae75fea9b12 100644 --- a/examples/research_projects/rag/consolidate_rag_checkpoint.py +++ b/examples/research_projects/rag/consolidate_rag_checkpoint.py @@ -17,7 +17,6 @@ def consolidate( generator_tokenizer_name_or_path: str = None, question_encoder_tokenizer_name_or_path: str = None, ): - if config_name_or_path is None: config_name_or_path = "facebook/rag-token-base" if model_type == "rag_token" else "facebook/rag-sequence-base" diff --git a/examples/research_projects/rag/distributed_ray_retriever.py b/examples/research_projects/rag/distributed_ray_retriever.py index 9ffc1b1e3845cd..dd5baaf726116f 100644 --- a/examples/research_projects/rag/distributed_ray_retriever.py +++ b/examples/research_projects/rag/distributed_ray_retriever.py @@ -2,6 +2,7 @@ import random import ray + from transformers import RagConfig, RagRetriever, RagTokenizer from transformers.models.rag.retrieval_rag import CustomHFIndex diff --git a/examples/research_projects/rag/lightning_base.py b/examples/research_projects/rag/lightning_base.py index 77830a4760ad39..e78a7582395875 100644 --- a/examples/research_projects/rag/lightning_base.py +++ b/examples/research_projects/rag/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -356,7 +356,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/rag/test_distributed_retriever.py b/examples/research_projects/rag/test_distributed_retriever.py index ac54d1f9857f1a..7e75e0a7a7efcc 100644 --- a/examples/research_projects/rag/test_distributed_retriever.py +++ b/examples/research_projects/rag/test_distributed_retriever.py @@ -7,10 +7,10 @@ from unittest import TestCase from unittest.mock import patch +import faiss import numpy as np from datasets import Dataset -import faiss from transformers import BartConfig, BartTokenizer, DPRConfig, DPRQuestionEncoderTokenizer, RagConfig from transformers.file_utils import is_datasets_available, is_faiss_available, is_psutil_available, is_torch_available from transformers.integrations import is_ray_available diff --git a/examples/research_projects/rag/use_own_knowledge_dataset.py b/examples/research_projects/rag/use_own_knowledge_dataset.py index dc08f508228abc..84d7c854975f11 100644 --- a/examples/research_projects/rag/use_own_knowledge_dataset.py +++ b/examples/research_projects/rag/use_own_knowledge_dataset.py @@ -6,10 +6,10 @@ from tempfile import TemporaryDirectory from typing import List, Optional +import faiss import torch from datasets import Features, Sequence, Value, load_dataset -import faiss from transformers import ( DPRContextEncoder, DPRContextEncoderTokenizerFast, @@ -56,7 +56,6 @@ def main( processing_args: "ProcessingArguments", index_hnsw_args: "IndexHnswArguments", ): - ###################################### logger.info("Step 1 - Create the dataset") 
###################################### diff --git a/examples/research_projects/robust-speech-event/eval.py b/examples/research_projects/robust-speech-event/eval.py index 32e3d1f2c729f8..a8acca1825d7da 100755 --- a/examples/research_projects/robust-speech-event/eval.py +++ b/examples/research_projects/robust-speech-event/eval.py @@ -36,7 +36,6 @@ def log_results(result: Dataset, args: Dict[str, str]): target_file = f"log_{dataset_id}_targets.txt" with open(pred_file, "w") as p, open(target_file, "w") as t: - # mapping function to write output def write_to_file(batch, i): p.write(f"{i}" + "\n") diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py index d58e17dd25c2ad..aaacc79cebd7a6 100755 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py @@ -25,12 +25,12 @@ from dataclasses import dataclass, field from typing import Dict, List, Optional, Union +import bitsandbytes as bnb import datasets import numpy as np import torch from datasets import DatasetDict, load_dataset, load_metric -import bitsandbytes as bnb import transformers from transformers import ( AutoConfig, @@ -717,7 +717,6 @@ def compute_metrics(pred): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py index ef2529fb09b238..54338f15988154 100644 --- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py +++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py @@ -622,7 +622,6 @@ def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/research_projects/self-training-text-classification/selftraining.py b/examples/research_projects/self-training-text-classification/selftraining.py index 7fde2fd1b8ed1e..70a6c2f319e0cb 100644 --- a/examples/research_projects/self-training-text-classification/selftraining.py +++ b/examples/research_projects/self-training-text-classification/selftraining.py @@ -23,12 +23,12 @@ from typing import List, Optional import datasets +from accelerate import Accelerator from datasets import load_dataset +from finetuning import finetune from tqdm.auto import tqdm import transformers -from accelerate import Accelerator -from finetuning import finetune from transformers import AutoConfig, set_seed from transformers.trainer_utils import IntervalStrategy diff --git a/examples/research_projects/seq2seq-distillation/_test_bash_script.py b/examples/research_projects/seq2seq-distillation/_test_bash_script.py index 53922f2b645bbc..fa84a60c0c88e0 100644 --- a/examples/research_projects/seq2seq-distillation/_test_bash_script.py +++ b/examples/research_projects/seq2seq-distillation/_test_bash_script.py @@ -8,9 +8,9 @@ import pytorch_lightning as pl import timeout_decorator import torch - from distillation import SummarizationDistiller, distill_main from finetune import SummarizationModule, main + from transformers import MarianMTModel from transformers.file_utils import cached_path 
from transformers.testing_utils import TestCasePlus, require_torch_gpu, slow diff --git a/examples/research_projects/seq2seq-distillation/_test_make_student.py b/examples/research_projects/seq2seq-distillation/_test_make_student.py index 0a1688a95cc11e..73df66315cbd79 100644 --- a/examples/research_projects/seq2seq-distillation/_test_make_student.py +++ b/examples/research_projects/seq2seq-distillation/_test_make_student.py @@ -2,6 +2,7 @@ import unittest from make_student import create_student_by_copying_alternating_layers + from transformers import AutoConfig from transformers.file_utils import cached_property from transformers.testing_utils import require_torch diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py index d97c9d43b3330d..b1c84ad9b8bdc3 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py @@ -5,18 +5,18 @@ import tempfile from pathlib import Path +import lightning_base import pytest import pytorch_lightning as pl import torch -from torch import nn - -import lightning_base from convert_pl_checkpoint_to_hf import convert_pl_to_hf from distillation import distill_main from finetune import SummarizationModule, main from huggingface_hub import list_models from parameterized import parameterized from run_eval import generate_summaries_or_translations +from torch import nn + from transformers import AutoConfig, AutoModelForSeq2SeqLM from transformers.testing_utils import CaptureStderr, CaptureStdout, TestCasePlus, require_torch_gpu, slow from utils import label_smoothed_nll_loss, lmap, load_json diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py index af6ae24bf4c349..bb06ec8e659fa7 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples_multi_gpu.py @@ -98,7 +98,6 @@ def setUpClass(cls): @require_torch_multi_gpu def test_multi_gpu(self): - updates = dict( no_teacher=True, freeze_encoder=True, diff --git a/examples/research_projects/seq2seq-distillation/distillation.py b/examples/research_projects/seq2seq-distillation/distillation.py index 78ff49718bb51a..323f62bf45812e 100755 --- a/examples/research_projects/seq2seq-distillation/distillation.py +++ b/examples/research_projects/seq2seq-distillation/distillation.py @@ -9,11 +9,11 @@ import pytorch_lightning as pl import torch -from torch import nn - from finetune import SummarizationModule, TranslationModule from finetune import main as ft_main from make_student import create_student_by_copying_alternating_layers, get_layers_to_supervise +from torch import nn + from transformers import AutoModelForSeq2SeqLM, MBartTokenizer, T5ForConditionalGeneration from transformers.models.bart.modeling_bart import shift_tokens_right from utils import calculate_bleu, check_output_dir, freeze_params, label_smoothed_nll_loss, use_task_specific_params diff --git a/examples/research_projects/seq2seq-distillation/finetune.py b/examples/research_projects/seq2seq-distillation/finetune.py index c20b361d583631..77f02bef135ed3 100755 --- a/examples/research_projects/seq2seq-distillation/finetune.py +++ b/examples/research_projects/seq2seq-distillation/finetune.py @@ -13,10 +13,10 @@ import 
numpy as np import pytorch_lightning as pl import torch +from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from torch import nn from torch.utils.data import DataLoader -from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback from transformers import MBartTokenizer, T5ForConditionalGeneration from transformers.models.bart.modeling_bart import shift_tokens_right from utils import ( diff --git a/examples/research_projects/seq2seq-distillation/lightning_base.py b/examples/research_projects/seq2seq-distillation/lightning_base.py index b3104a25a8b129..f246ecab0dd01b 100644 --- a/examples/research_projects/seq2seq-distillation/lightning_base.py +++ b/examples/research_projects/seq2seq-distillation/lightning_base.py @@ -69,7 +69,7 @@ def __init__( config=None, tokenizer=None, model=None, - **config_kwargs + **config_kwargs, ): """Initialize a model, tokenizer and config.""" super().__init__() @@ -346,7 +346,7 @@ def generic_train( extra_callbacks=[], checkpoint_callback=None, logging_callback=None, - **extra_train_kwargs + **extra_train_kwargs, ): pl.seed_everything(args.seed) diff --git a/examples/research_projects/seq2seq-distillation/make_student.py b/examples/research_projects/seq2seq-distillation/make_student.py index a4021505b998e0..c1efc1b497abba 100644 --- a/examples/research_projects/seq2seq-distillation/make_student.py +++ b/examples/research_projects/seq2seq-distillation/make_student.py @@ -84,7 +84,7 @@ def create_student_by_copying_alternating_layers( copy_first_teacher_layers=False, e_layers_to_copy=None, d_layers_to_copy=None, - **extra_config_kwargs + **extra_config_kwargs, ) -> Tuple[PreTrainedModel, List[int], List[int]]: """Make a student by copying alternating layers from a teacher, save it to save_path. 
Args: @@ -107,7 +107,6 @@ def create_student_by_copying_alternating_layers( AutoTokenizer.from_pretrained(teacher).save_pretrained(save_path) # purely for convenience teacher = AutoModelForSeq2SeqLM.from_pretrained(teacher).eval() else: - assert isinstance(teacher, PreTrainedModel), f"teacher must be a model or string got type {type(teacher)}" init_kwargs = teacher.config.to_diff_dict() diff --git a/examples/research_projects/seq2seq-distillation/utils.py b/examples/research_projects/seq2seq-distillation/utils.py index a45194e6e054bf..f1a8cef8508ccd 100644 --- a/examples/research_projects/seq2seq-distillation/utils.py +++ b/examples/research_projects/seq2seq-distillation/utils.py @@ -15,10 +15,10 @@ import torch.distributed as dist from rouge_score import rouge_scorer, scoring from sacrebleu import corpus_bleu +from sentence_splitter import add_newline_to_end_of_each_sentence from torch import nn from torch.utils.data import Dataset, Sampler -from sentence_splitter import add_newline_to_end_of_each_sentence from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer from transformers.file_utils import cached_property from transformers.models.bart.modeling_bart import shift_tokens_right @@ -115,7 +115,7 @@ def __init__( type_path="train", n_obs=None, prefix="", - **dataset_kwargs + **dataset_kwargs, ): super().__init__() self.src_file = Path(data_dir).joinpath(type_path + ".source") diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py index 1d402fa7e8f0e9..a5717d245cb6c9 100644 --- a/examples/research_projects/tapex/run_wikisql_with_tapex.py +++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py @@ -32,9 +32,10 @@ import numpy as np import pandas as pd from datasets import load_dataset +from filelock import FileLock +from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas import transformers -from filelock import FileLock from transformers import ( AutoConfig, BartForConditionalGeneration, @@ -48,7 +49,6 @@ from transformers.file_utils import is_offline_mode from transformers.trainer_utils import get_last_checkpoint, is_main_process from transformers.utils import check_min_version -from wikisql_utils import _TYPE_CONVERTER, retrieve_wikisql_query_answer_tapas # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py index 6f93f9b5166929..901e921f26a694 100644 --- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py +++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py @@ -31,9 +31,9 @@ import numpy as np import pandas as pd from datasets import load_dataset +from filelock import FileLock import transformers -from filelock import FileLock from transformers import ( AutoConfig, BartForConditionalGeneration, diff --git a/examples/research_projects/visual_bert/extracting_data.py b/examples/research_projects/visual_bert/extracting_data.py index 9790e20ad86bf9..9c445be336f553 100644 --- a/examples/research_projects/visual_bert/extracting_data.py +++ b/examples/research_projects/visual_bert/extracting_data.py @@ -9,9 +9,9 @@ import datasets import numpy as np import torch - from modeling_frcnn import GeneralizedRCNN from processing_image import Preprocess + from utils import Config diff --git a/examples/research_projects/visual_bert/modeling_frcnn.py b/examples/research_projects/visual_bert/modeling_frcnn.py index 33c1133e9589f4..08758b1d3cac06 100644 --- a/examples/research_projects/visual_bert/modeling_frcnn.py +++ b/examples/research_projects/visual_bert/modeling_frcnn.py @@ -169,7 +169,6 @@ def get_norm(norm, out_channels): def _create_grid_offsets(size: List[int], stride: int, offset: float, device): - grid_height, grid_width = size shifts_x = torch.arange( offset * stride, @@ -390,7 +389,6 @@ def assign_boxes_to_levels( canonical_box_size: int, canonical_level: int, ): - box_sizes = torch.sqrt(torch.cat([boxes.area() for boxes in box_lists])) # Eqn.(1) in FPN paper level_assignments = torch.floor(canonical_level + torch.log2(box_sizes / canonical_box_size + 1e-8)) @@ -1708,9 +1706,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert from_tf, ( - "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint" - .format(pretrained_model_name_or_path + ".index") + assert ( + from_tf + ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index" ) archive_file = pretrained_model_name_or_path + ".index" else: diff --git a/examples/research_projects/visual_bert/utils.py b/examples/research_projects/visual_bert/utils.py index 8e830fb8359d29..2fc6ea2062efd2 100644 --- a/examples/research_projects/visual_bert/utils.py +++ b/examples/research_projects/visual_bert/utils.py @@ -34,14 +34,13 @@ from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile -import numpy as np -from PIL import Image -from tqdm.auto import tqdm - import cv2 +import numpy as np import requests import wget from filelock import FileLock +from PIL import Image +from tqdm.auto import tqdm from yaml import Loader, dump, load @@ -181,7 +180,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs): @classmethod def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): - cache_dir = kwargs.pop("cache_dir", None) force_download = kwargs.pop("force_download", False) resume_download = kwargs.pop("resume_download", False) @@ -225,14 +223,13 @@ 
def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs): # quick compare tensors def compare(in_tensor): - out_tensor = torch.load("dump.pt", map_location=in_tensor.device) n1 = in_tensor.numpy() n2 = out_tensor.numpy()[0] print(n1.shape, n1[0, 0, :5]) print(n2.shape, n2[0, 0, :5]) assert np.allclose(n1, n2, rtol=0.01, atol=0.1), ( - f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x == False])/len(n1.flatten())*100:.4f} %" + f"{sum([1 for x in np.isclose(n1, n2, rtol=0.01, atol=0.1).flatten() if x is False])/len(n1.flatten())*100:.4f} %" " element-wise mismatch" ) raise Exception("tensors are all good") @@ -300,7 +297,6 @@ def get_from_cache( user_agent=None, local_files_only=False, ): - if cache_dir is None: cache_dir = TRANSFORMERS_CACHE if isinstance(cache_dir, Path): @@ -355,7 +351,6 @@ def get_from_cache( # Prevent parallel downloads of the same file with a lock. lock_path = cache_path + ".lock" with FileLock(lock_path): - # If the download just completed while the lock was activated. if os.path.exists(cache_path) and not force_download: # Even if returning early like here, the lock will be released. @@ -406,7 +401,6 @@ def _resumable_file_manager(): def url_to_filename(url, etag=None): - url_bytes = url.encode("utf-8") url_hash = sha256(url_bytes) filename = url_hash.hexdigest() diff --git a/examples/research_projects/visual_bert/visualizing_image.py b/examples/research_projects/visual_bert/visualizing_image.py index a02dc66dfb7c61..163d661e873ec3 100644 --- a/examples/research_projects/visual_bert/visualizing_image.py +++ b/examples/research_projects/visual_bert/visualizing_image.py @@ -18,6 +18,7 @@ import colorsys import io +import cv2 import matplotlib as mpl import matplotlib.colors as mplc import matplotlib.figure as mplfigure @@ -25,7 +26,6 @@ import torch from matplotlib.backends.backend_agg import FigureCanvasAgg -import cv2 from utils import img_tensorize diff --git a/examples/research_projects/vqgan-clip/VQGAN_CLIP.py b/examples/research_projects/vqgan-clip/VQGAN_CLIP.py index c936148c554368..b5a23c15b2b1a9 100644 --- a/examples/research_projects/vqgan-clip/VQGAN_CLIP.py +++ b/examples/research_projects/vqgan-clip/VQGAN_CLIP.py @@ -1,15 +1,15 @@ import os from glob import glob +import imageio import torch import torchvision -from PIL import Image -from torch import nn - -import imageio import wandb from img_processing import custom_to_pil, loop_post_process, preprocess, preprocess_vqgan from loaders import load_vqgan +from PIL import Image +from torch import nn + from transformers import CLIPModel, CLIPTokenizerFast from utils import get_device, get_timestamp, show_pil diff --git a/examples/research_projects/vqgan-clip/loaders.py b/examples/research_projects/vqgan-clip/loaders.py index 3fd86522dcad22..e8650f72128456 100644 --- a/examples/research_projects/vqgan-clip/loaders.py +++ b/examples/research_projects/vqgan-clip/loaders.py @@ -1,7 +1,6 @@ import importlib import torch - import yaml from omegaconf import OmegaConf from taming.models.vqgan import VQModel diff --git a/examples/research_projects/wav2vec2/alignment.py b/examples/research_projects/wav2vec2/alignment.py index 24347a55a0bce7..55b477f5ee967a 100644 --- a/examples/research_projects/wav2vec2/alignment.py +++ b/examples/research_projects/wav2vec2/alignment.py @@ -176,7 +176,6 @@ def merge_repeats(path): out_align.write(str(seg) + "\n") def align_data(self, wav_dir, text_file, output_dir): - if not os.path.exists(output_dir): os.makedirs(output_dir) diff --git 
a/examples/research_projects/wav2vec2/run_asr.py b/examples/research_projects/wav2vec2/run_asr.py index 692aa39796a769..15d2f12c7ddb56 100755 --- a/examples/research_projects/wav2vec2/run_asr.py +++ b/examples/research_projects/wav2vec2/run_asr.py @@ -7,13 +7,13 @@ from typing import Any, Callable, Dict, List, Optional, Set, Union import datasets +import librosa import numpy as np import torch +from lang_trans import arabic from packaging import version from torch import nn -import librosa -from lang_trans import arabic from transformers import ( HfArgumentParser, Trainer, diff --git a/examples/research_projects/wav2vec2/run_pretrain.py b/examples/research_projects/wav2vec2/run_pretrain.py index 8e0801429e61ec..985e6df40e31d1 100755 --- a/examples/research_projects/wav2vec2/run_pretrain.py +++ b/examples/research_projects/wav2vec2/run_pretrain.py @@ -4,12 +4,12 @@ from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Union +import librosa import torch from datasets import DatasetDict, load_dataset from packaging import version from torch import nn -import librosa from transformers import ( HfArgumentParser, Trainer, diff --git a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py index a414f7db9770e6..8f181409d6d7e6 100644 --- a/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py +++ b/examples/research_projects/wav2vec2/test_wav2vec2_deepspeed.py @@ -126,7 +126,6 @@ def run_and_check( quality_checks: bool = True, fp16: bool = True, ): - model_name = models[model] output_dir = self.run_trainer( @@ -151,7 +150,6 @@ def run_trainer( distributed: bool = True, fp16: bool = True, ): - output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) args = f""" --model_name_or_path {model_name} diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py index 16fc1ac8a39c32..38ed3376ecc0ba 100644 --- a/examples/research_projects/xtreme-s/run_xtreme_s.py +++ b/examples/research_projects/xtreme-s/run_xtreme_s.py @@ -327,7 +327,6 @@ class DataTrainingArguments: @dataclass class SpeechDataCollatorWithPadding: - processor: AutoProcessor decoder_start_token_id: Optional[int] = None padding: Union[bool, str] = "longest" @@ -863,7 +862,6 @@ def compute_classification_metric(pred): # Training if training_args.do_train: - # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index f8d6cf0d1984ab..d9fcc8daafa62d 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -26,12 +26,12 @@ from dataclasses import dataclass, field from typing import Optional +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset from PIL import Image -import evaluate import transformers from transformers import ( MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index fecc55853982b0..f5bc179a96ac11 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -117,6 +117,7 @@ def __call__(self, features): # endregion + # region Arguments @dataclass class ModelArguments: 
diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index e5d6018e5c7187..1c3acd34aedd9d 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -26,10 +26,11 @@ from pathlib import Path from typing import Optional +import evaluate import tensorflow as tf from datasets import load_dataset +from utils_qa import postprocess_qa_predictions -import evaluate import transformers from transformers import ( AutoConfig, @@ -44,7 +45,6 @@ set_seed, ) from transformers.utils import CONFIG_NAME, TF2_WEIGHTS_NAME, check_min_version, send_example_telemetry -from utils_qa import postprocess_qa_predictions # Will error if the minimal version of Transformers is not installed. Remove at your own risks. @@ -214,6 +214,7 @@ def __post_init__(self): # endregion + # region Helper classes class SavePretrainedCallback(tf.keras.callbacks.Callback): # Hugging Face models have a save_pretrained() method that saves both the weights and the necessary @@ -610,7 +611,6 @@ def compute_metrics(p: EvalPrediction): # endregion with training_args.strategy.scope(): - dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF num_replicas = training_args.strategy.num_replicas_in_sync @@ -628,7 +628,6 @@ def compute_metrics(p: EvalPrediction): use_auth_token=True if model_args.use_auth_token else None, ) if training_args.do_train: - training_dataset = model.prepare_tf_dataset( processed_datasets["train"], shuffle=True, diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index b8598e2fdcd9b5..61ee9c2ba6d37f 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -26,14 +26,14 @@ from typing import Optional import datasets +import evaluate import nltk # Here to have a nice missing dependency error message early on import numpy as np import tensorflow as tf from datasets import load_dataset +from filelock import FileLock -import evaluate import transformers -from filelock import FileLock from transformers import ( AutoConfig, AutoTokenizer, diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index dceb0783f873f1..bf03901011fa4b 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -23,11 +23,11 @@ from dataclasses import dataclass, field from typing import Optional +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset -import evaluate import transformers from transformers import ( AutoConfig, diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py index 86fe819e28912a..7b90938f02d7ae 100644 --- a/examples/tensorflow/token-classification/run_ner.py +++ b/examples/tensorflow/token-classification/run_ner.py @@ -26,10 +26,10 @@ from typing import Optional import datasets +import evaluate import tensorflow as tf from datasets import ClassLabel, load_dataset -import evaluate import transformers from transformers import ( CONFIG_MAPPING, diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 206386bc684131..09c0b8a9ea7ed0 100644 --- a/examples/tensorflow/translation/run_translation.py +++ 
b/examples/tensorflow/translation/run_translation.py @@ -26,11 +26,11 @@ from typing import Optional import datasets +import evaluate import numpy as np import tensorflow as tf from datasets import load_dataset -import evaluate import transformers from transformers import ( AutoConfig, diff --git a/pyproject.toml b/pyproject.toml index 291558c9a3deaa..e2d594210f79ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,13 @@ [tool.black] line-length = 119 -target-version = ['py35'] +target-version = ['py37'] + +[tool.ruff] +# Never enforce `E501` (line length violations). +ignore = ["E501", "E741", "W605"] +select = ["E", "F", "I", "W"] +line-length = 119 + +[tool.ruff.isort] +lines-after-imports = 2 +known-first-party = ["transformers"] diff --git a/setup.cfg b/setup.cfg index 2d605ccceca788..9a56ddc2fc65ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,43 +4,6 @@ ensure_newline_before_comments = True force_grid_wrap = 0 include_trailing_comma = True known_first_party = transformers -known_third_party = - absl - conllu - datasets - elasticsearch - fairseq - faiss-cpu - fastprogress - fire - fugashi - git - h5py - matplotlib - nltk - numpy - packaging - pandas - PIL - psutil - pytest - pytorch_lightning - rouge_score - sacrebleu - seqeval - sklearn - streamlit - tensorboardX - tensorflow - tensorflow_datasets - timeout_decorator - torch - torchaudio - torchtext - torchvision - torch_xla - tqdm - line_length = 119 lines_after_imports = 2 multi_line_output = 3 diff --git a/setup.py b/setup.py index c5085c14874ef3..aad145b145f275 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ _deps = [ "Pillow", "accelerate>=0.10.0", - "black==22.3", # after updating to black 2023, also update Python version in pyproject.toml to 3.7 + "black~=23.1", "codecarbon==1.2.0", "cookiecutter==1.7.3", "dataclasses", @@ -111,7 +111,6 @@ "faiss-cpu", "fastapi", "filelock", - "flake8>=3.8.3", "flax>=0.4.1", "ftfy", "fugashi>=1.0", @@ -150,6 +149,7 @@ "requests", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", + "ruff>=0.0.241", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.2.1", @@ -321,7 +321,7 @@ def run(self): extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"] -extras["quality"] = deps_list("black", "datasets", "isort", "flake8", "GitPython", "hf-doc-builder") +extras["quality"] = deps_list("black", "datasets", "isort", "ruff", "GitPython", "hf-doc-builder") extras["all"] = ( extras["tf"] diff --git a/src/transformers/benchmark/benchmark.py b/src/transformers/benchmark/benchmark.py index 7f95e4b40b7cd3..3c5c877a454e63 100644 --- a/src/transformers/benchmark/benchmark.py +++ b/src/transformers/benchmark/benchmark.py @@ -48,7 +48,6 @@ class PyTorchBenchmark(Benchmark): - args: PyTorchBenchmarkArguments configs: PretrainedConfig framework: str = "PyTorch" diff --git a/src/transformers/benchmark/benchmark_args.py b/src/transformers/benchmark/benchmark_args.py index 26c0eb95a4bcca..b5887e4a9bcb4b 100644 --- a/src/transformers/benchmark/benchmark_args.py +++ b/src/transformers/benchmark/benchmark_args.py @@ -33,7 +33,6 @@ @dataclass class PyTorchBenchmarkArguments(BenchmarkArguments): - deprecated_args = [ "no_inference", "no_cuda", diff --git a/src/transformers/benchmark/benchmark_args_tf.py b/src/transformers/benchmark/benchmark_args_tf.py index 12cb6f5cbbeb86..c1c2ec16ce550c 100644 --- a/src/transformers/benchmark/benchmark_args_tf.py +++ b/src/transformers/benchmark/benchmark_args_tf.py @@ -30,7 +30,6 @@ @dataclass class 
TensorFlowBenchmarkArguments(BenchmarkArguments): - deprecated_args = [ "no_inference", "no_cuda", diff --git a/src/transformers/benchmark/benchmark_tf.py b/src/transformers/benchmark/benchmark_tf.py index b5fd4b71b562a2..126172ffbd3000 100644 --- a/src/transformers/benchmark/benchmark_tf.py +++ b/src/transformers/benchmark/benchmark_tf.py @@ -77,7 +77,6 @@ def random_input_ids(batch_size: int, sequence_length: int, vocab_size: int) -> class TensorFlowBenchmark(Benchmark): - args: TensorFlowBenchmarkArguments configs: PretrainedConfig framework: str = "TensorFlow" diff --git a/src/transformers/benchmark/benchmark_utils.py b/src/transformers/benchmark/benchmark_utils.py index 79740805807185..a6c6353c19fb37 100644 --- a/src/transformers/benchmark/benchmark_utils.py +++ b/src/transformers/benchmark/benchmark_utils.py @@ -890,7 +890,6 @@ def save_to_csv(self, result_dict, filename): return self.print_fn("Saving results to csv.") with open(filename, mode="w") as csv_file: - assert len(self.args.model_names) > 0, f"At least 1 model should be defined, but got {self.model_names}" fieldnames = ["model", "batch_size", "sequence_length"] diff --git a/src/transformers/commands/convert.py b/src/transformers/commands/convert.py index 8c3e37bfcf3d1c..a8cf431cb0876b 100644 --- a/src/transformers/commands/convert.py +++ b/src/transformers/commands/convert.py @@ -71,7 +71,7 @@ def __init__( pytorch_dump_output: str, config: str, finetuning_task_name: str, - *args + *args, ): self._logger = logging.get_logger("transformers-cli/converting") diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index b65249f290856a..669f7a98003bac 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -17,11 +17,10 @@ from argparse import ArgumentParser, Namespace from importlib import import_module +import huggingface_hub import numpy as np from packaging import version -import huggingface_hub - from .. import ( FEATURE_EXTRACTOR_MAPPING, IMAGE_PROCESSOR_MAPPING, @@ -145,7 +144,6 @@ def find_pt_tf_differences(pt_outputs, tf_outputs): # 2. For each output attribute, computes the difference def _find_pt_tf_differences(pt_out, tf_out, differences, attr_name=""): - # If the current attribute is a tensor, it is a leaf and we make the comparison. Otherwise, we will dig in # recursivelly, keeping the name of the attribute. 
if isinstance(pt_out, torch.Tensor): @@ -177,7 +175,7 @@ def __init__( no_pr: bool, push: bool, extra_commit_description: str, - *args + *args, ): self._logger = logging.get_logger("transformers-cli/pt_to_tf") self._model_name = model_name diff --git a/src/transformers/commands/serving.py b/src/transformers/commands/serving.py index 4deae833f712e1..803ae71d1c122e 100644 --- a/src/transformers/commands/serving.py +++ b/src/transformers/commands/serving.py @@ -122,7 +122,6 @@ def register_subcommand(parser: ArgumentParser): serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int, workers: int): - self._pipeline = pipeline self.host = host diff --git a/src/transformers/convert_graph_to_onnx.py b/src/transformers/convert_graph_to_onnx.py index ce4350b3b1f284..5449d98237ea64 100644 --- a/src/transformers/convert_graph_to_onnx.py +++ b/src/transformers/convert_graph_to_onnx.py @@ -328,7 +328,6 @@ def convert_tensorflow(nlp: Pipeline, opset: int, output: Path): try: import tensorflow as tf - import tf2onnx from tf2onnx import __version__ as t2ov @@ -358,7 +357,7 @@ def convert( tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", - **model_kwargs + **model_kwargs, ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index 62a071dd3cc135..f1358408a5cb57 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -358,7 +358,6 @@ def convert_all_pt_checkpoints_to_tf( remove_cached_files=False, only_convert_finetuned_models=False, ): - if args_model_type is None: model_types = list(MODEL_CLASSES.keys()) else: diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index befa22c2e1787b..72df3bece21925 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -20,9 +20,8 @@ from typing import List, Optional, Union import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...tokenization_utils_base import PreTrainedTokenizerBase from ...utils import logging @@ -121,7 +120,6 @@ def __init__( # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): - if os.path.exists(cached_features_file) and not args.overwrite_cache: start = time.time() self.features = torch.load(cached_features_file) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 8c6c98b264f363..6c23bf23cf14d4 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -21,9 +21,8 @@ from typing import Dict, List, Optional import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...tokenization_utils import PreTrainedTokenizer from ...utils import logging @@ -72,7 +71,6 @@ def __init__( # and the others will use the cache. 
lock_path = cached_features_file + ".lock" with FileLock(lock_path): - if os.path.exists(cached_features_file) and not overwrite_cache: start = time.time() with open(cached_features_file, "rb") as handle: diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index e1c8c9cb6c0500..d81217d818afff 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -19,9 +19,8 @@ from typing import Dict, List, Optional, Union import torch -from torch.utils.data import Dataset - from filelock import FileLock +from torch.utils.data import Dataset from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING from ...tokenization_utils import PreTrainedTokenizer diff --git a/src/transformers/data/metrics/__init__.py b/src/transformers/data/metrics/__init__.py index aca2b7ffd43cb2..c9c998867ae0e1 100644 --- a/src/transformers/data/metrics/__init__.py +++ b/src/transformers/data/metrics/__init__.py @@ -20,9 +20,8 @@ if is_sklearn_available(): - from sklearn.metrics import f1_score, matthews_corrcoef - from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import f1_score, matthews_corrcoef DEPRECATION_WARNING = ( diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index 64137c95aca266..0f8bd248055115 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -166,7 +166,6 @@ def squad_convert_example_to_features( span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): - # Define the side we want to truncate / pad and the text/pair sorting if tokenizer.padding_side == "right": texts = truncated_query diff --git a/src/transformers/data/test_generation_utils.py b/src/transformers/data/test_generation_utils.py index b08dd88026ba45..a69b5683de755d 100644 --- a/src/transformers/data/test_generation_utils.py +++ b/src/transformers/data/test_generation_utils.py @@ -73,7 +73,6 @@ def test_postprocess_next_token_scores(self): @timeout_decorator.timeout(10) def test_postprocess_next_token_scores_large_bad_words_list(self): - config = self.config model = self.model # Initialize an input id tensor with batch size 8 and sequence length 12 diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 9465307f5f57ea..037c5985f88b94 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -344,7 +344,6 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs) if resume_from_checkpoint is not None: - # it's possible that the user is trying to resume from model_path, which doesn't necessarily # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's # a resume from a checkpoint and not just a local pretrained weight. 
So we check here if the diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 1a256380f0f6d7..d60da273bf001d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -4,7 +4,7 @@ deps = { "Pillow": "Pillow", "accelerate": "accelerate>=0.10.0", - "black": "black==22.3", + "black": "black~=23.1", "codecarbon": "codecarbon==1.2.0", "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", @@ -17,7 +17,6 @@ "faiss-cpu": "faiss-cpu", "fastapi": "fastapi", "filelock": "filelock", - "flake8": "flake8>=3.8.3", "flax": "flax>=0.4.1", "ftfy": "ftfy", "fugashi": "fugashi>=1.0", @@ -56,6 +55,7 @@ "requests": "requests", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", + "ruff": "ruff>=0.0.241", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.2.1", diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py index 8b720e6a77358c..d29675f75754d0 100644 --- a/src/transformers/generation/beam_search.py +++ b/src/transformers/generation/beam_search.py @@ -98,7 +98,7 @@ def process( next_scores: torch.FloatTensor, next_tokens: torch.LongTensor, next_indices: torch.LongTensor, - **kwargs + **kwargs, ) -> Tuple[torch.Tensor]: raise NotImplementedError("This is an abstract method.") @@ -111,7 +111,7 @@ def finalize( next_tokens: torch.LongTensor, next_indices: torch.LongTensor, max_length: int, - **kwargs + **kwargs, ) -> torch.LongTensor: raise NotImplementedError("This is an abstract method.") @@ -574,7 +574,6 @@ def process( batch_beam_idx = batch_idx * self.group_size + next_index # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (next_token.item() in eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size if is_beam_token_worse_than_top_num_beams: diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index e01b0dc40aa70a..f2b6a76e11e774 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -322,7 +322,7 @@ def save_pretrained( save_directory: Union[str, os.PathLike], config_file_name: Optional[Union[str, os.PathLike]] = None, push_to_hub: bool = False, - **kwargs + **kwargs, ): r""" Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the @@ -372,7 +372,7 @@ def from_pretrained( cls, pretrained_model_name: Union[str, os.PathLike], config_file_name: Optional[Union[str, os.PathLike]] = None, - **kwargs + **kwargs, ) -> "GenerationConfig": r""" Instantiate a [`GenerationConfig`] from a generation configuration file. 
diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index 92e3cff82cb5c2..ccbe7408ff095a 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -258,7 +258,6 @@ def __init__(self, min_length: int, eos_token_id: int): self.eos_token_id = eos_token_id def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray: - # create boolean flag to decide if min length penalty should be applied apply_penalty = 1 - jnp.clip(cur_len - self.min_length, 0, 1) diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 1cfc07b9786c8b..b615b3fa3592ee 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -21,11 +21,10 @@ from functools import partial from typing import Any, Dict, Optional, Union -import numpy as np - import flax import jax import jax.numpy as jnp +import numpy as np from jax import lax from ..models.auto import ( diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py index 19a14c63b2c877..369c969526a4f7 100644 --- a/src/transformers/generation/tf_logits_process.py +++ b/src/transformers/generation/tf_logits_process.py @@ -296,7 +296,6 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor): """ def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): - if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): @@ -421,7 +420,6 @@ def _get_generated_ngrams(hypo_idx): return banned_tokens def __call__(self, input_ids: tf.Tensor, scores: tf.Tensor, cur_len: int) -> tf.Tensor: - # TODO (joao): enable XLA on this logits processor. 
See discussion and attempts in # https://github.com/huggingface/transformers/pull/16974 if not tf.executing_eagerly(): diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py index ec246aaff0e243..b61398163f5435 100644 --- a/src/transformers/generation/tf_utils.py +++ b/src/transformers/generation/tf_utils.py @@ -951,7 +951,6 @@ def _prepare_decoder_input_ids_for_generation( bos_token_id: int = None, model_kwargs: Optional[Dict[str, tf.Tensor]] = None, ) -> tf.Tensor: - # prepare `input_ids` for decoder if model is encoder-decoder if model_kwargs is not None and "decoder_input_ids" in model_kwargs: return model_kwargs.pop("decoder_input_ids") @@ -2459,7 +2458,6 @@ def contrastive_search_body_fn( # if the first step in the loop, encode all the prefix and obtain: (1) past_key_values; # (2) last_hidden_states; (3) logit_for_next_step; (4) update model kwargs for the next step if model_kwargs.get("past_key_values") is None: - # prepare inputs model_inputs = self.prepare_inputs_for_generation( generated[:, :cur_len], use_cache=use_cache, **model_kwargs @@ -2676,7 +2674,13 @@ def contrastive_search_body_fn( generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables ): maximum_iterations = max_length - cur_len - generated, _, cur_len, _, _, = tf.while_loop( + ( + generated, + _, + cur_len, + _, + _, + ) = tf.while_loop( contrastive_search_cond_fn, contrastive_search_body_fn, (generated, finished_sequences, cur_len, model_kwargs, next_step_cached_variables), diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index f81d76a966c913..9fd563fe388a57 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -17,9 +17,8 @@ from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple, Union import numpy as np -from packaging import version - import requests +from packaging import version from .utils import ( ExplicitEnum, diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py index f1fa04d1936a76..b1ebdba9a07f27 100644 --- a/src/transformers/integrations.py +++ b/src/transformers/integrations.py @@ -354,6 +354,7 @@ def dynamic_modules_import_trainable(*args, **kwargs): def run_hp_search_sigopt(trainer, n_trials: int, direction: str, **kwargs) -> BestRun: import sigopt + from transformers.utils.versions import importlib_metadata if trainer.args.process_index == 0: @@ -723,7 +724,6 @@ def setup(self, args, state, model, **kwargs): init_args["name"] = args.run_name if self._wandb.run is None: - self._wandb.init( project=os.getenv("WANDB_PROJECT", "huggingface"), **init_args, @@ -1158,7 +1158,7 @@ def __init__( run: Optional["Run"] = None, log_parameters: bool = True, log_checkpoints: Optional[str] = None, - **neptune_run_kwargs + **neptune_run_kwargs, ): if not is_neptune_available(): raise ValueError( diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py index f99bd738ea7c58..4fd2da18a6b5e3 100644 --- a/src/transformers/keras_callbacks.py +++ b/src/transformers/keras_callbacks.py @@ -6,11 +6,10 @@ import numpy as np import tensorflow as tf +from huggingface_hub import Repository, create_repo from packaging.version import parse from tensorflow.keras.callbacks import Callback -from huggingface_hub import Repository, create_repo - from . 
import IntervalStrategy, PreTrainedTokenizerBase from .modelcard import TrainingSummary from .utils import get_full_repo_name @@ -320,7 +319,7 @@ def __init__( hub_model_id: Optional[str] = None, hub_token: Optional[str] = None, checkpoint: bool = False, - **model_card_args + **model_card_args, ): super().__init__() if checkpoint and save_strategy != "epoch": diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index 47da8c2871b321..e013e74eef4d09 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -19,14 +19,14 @@ from pickle import UnpicklingError from typing import Dict, Tuple -import numpy as np - import jax import jax.numpy as jnp -import transformers +import numpy as np from flax.serialization import from_bytes from flax.traverse_util import flatten_dict, unflatten_dict +import transformers + from .utils import logging @@ -130,7 +130,6 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model): # Need to change some parameters name to match Flax names for pt_key, pt_tensor in pt_state_dict.items(): - pt_tuple_key = tuple(pt_key.split(".")) # remove base model prefix if necessary @@ -187,7 +186,6 @@ def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): ) # Need to change some parameters name to match Flax names for pt_key, pt_tensor in pt_state_dict.items(): - pt_tuple_key = tuple(pt_key.split(".")) # remove base model prefix if necessary diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index c501764350c296..a635c7b62b322e 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -484,9 +484,8 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, os.PathLike], dtype: jnp.dtype = jnp.float32, *model_args, - **kwargs + **kwargs, ): - r""" Instantiate a pretrained flax model from a pre-trained model configuration. @@ -810,7 +809,6 @@ def from_pretrained( if from_pt: state = load_pytorch_checkpoint_in_flax_state_dict(model, resolved_archive_file, is_sharded) else: - if is_sharded: state = cls.load_flax_sharded_weights(resolved_archive_file) else: diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 42a6adea12d1fc..1a313ec959cb8d 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -30,9 +30,9 @@ import h5py import numpy as np import tensorflow as tf +from huggingface_hub import Repository, list_repo_files from packaging.version import parse -from huggingface_hub import Repository, list_repo_files from transformers.utils.hub import convert_file_size_to_int, get_checkpoint_shard_files from . 
import DataCollatorWithPadding, DefaultDataCollator @@ -1402,7 +1402,7 @@ def compile( weighted_metrics=None, run_eagerly=None, steps_per_execution=None, - **kwargs + **kwargs, ): """ This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss @@ -1501,7 +1501,6 @@ def train_step(self, data): # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, # if those keys are not already present in the input dict if self._using_dummy_loss and y is not None: - # If y is a tensor and the model only has one label-like input, map y to that input if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): if isinstance(x, tf.Tensor): @@ -2228,7 +2227,7 @@ def save_pretrained( max_shard_size: Union[int, str] = "10GB", create_pr: bool = False, safe_serialization: bool = False, - **kwargs + **kwargs, ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the @@ -2893,7 +2892,7 @@ def push_to_hub( private: Optional[bool] = None, use_auth_token: Optional[Union[bool, str]] = None, max_shard_size: Optional[Union[int, str]] = "10GB", - **model_card_kwargs + **model_card_kwargs, ) -> str: """ Upload the model files to the 🤗 Model Hub while synchronizing a local clone of the repo in `repo_path_or_name`. diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 484467c0c13536..e03f87f92f9c4c 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -962,7 +962,6 @@ def floating_point_ops( class BackboneMixin: def forward_with_filtered_kwargs(self, *args, **kwargs): - signature = dict(inspect.signature(self.forward).parameters) filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} @@ -2520,7 +2519,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) raise elif from_pt: - # restore default dtype if dtype_orig is not None: torch.set_default_dtype(dtype_orig) diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index d11d49a2c78fcc..fd0c6238879257 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -132,7 +132,7 @@ def __init__( pad_token_id=0, bos_token_id=2, eos_token_id=3, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index dc9559ac3624ef..687a927ef0c486 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -631,7 +631,6 @@ class AlbertForPreTrainingOutput(ModelOutput): ALBERT_START_DOCSTRING, ) class AlbertModel(AlbertPreTrainedModel): - config_class = AlbertConfig base_model_prefix = "albert" @@ -912,7 +911,6 @@ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor: ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [ "predictions.decoder.weight", @@ -1133,7 +1131,6 @@ def forward( ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config: AlbertConfig): @@ -1219,7 +1216,6 @@ def forward( ALBERT_START_DOCSTRING, ) class 
AlbertForQuestionAnswering(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config: AlbertConfig): diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py index e55038c8acff72..0ff1b9276a19d6 100644 --- a/src/transformers/models/albert/modeling_flax_albert.py +++ b/src/transformers/models/albert/modeling_flax_albert.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -523,7 +522,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -702,7 +701,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.albert( input_ids, diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py index 823c7c48bbb572..247ee395dc60fe 100644 --- a/src/transformers/models/albert/modeling_tf_albert.py +++ b/src/transformers/models/albert/modeling_tf_albert.py @@ -581,7 +581,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 5bebb936cf7d11..b043a14989fc57 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -147,7 +147,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index c2ffcd90b192e9..16c54e7eac6c94 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -135,7 +135,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. 
diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 25d7fa69c18ce0..523cf420e0aed5 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -117,7 +117,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, project_dim=768, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -208,7 +208,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -228,7 +228,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from AltCLIPConfig diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index d21e329460cd3d..7d150d5734e1e8 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -637,7 +637,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 19f85189ad0dbd..22b0ca70ac8520 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -105,7 +105,7 @@ def __init__( time_stride=10, max_length=1024, num_mel_bins=128, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py index f339bbc6c2bf53..32e0f33d04fdb2 100644 --- a/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py +++ b/src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -22,8 +22,8 @@ import torch import torchaudio from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification from transformers.utils import logging diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index f61f62b4783e96..deda2fc7781b28 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -73,7 +73,7 @@ def __init__( mean=-4.2677393, std=4.5689974, return_attention_mask=False, 
-        **kwargs
+        **kwargs,
     ):
         super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
         self.num_mel_bins = num_mel_bins
@@ -127,7 +127,7 @@ def __call__(
         raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
         sampling_rate: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs
+        **kwargs,
     ) -> BatchFeature:
         """
         Main method to featurize and prepare for the model one or several sequence(s).
diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
index de67d477445069..54b77df7458d2c 100644
--- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
+++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py
@@ -188,7 +188,6 @@ def __init__(self, config: ASTConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
-
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)
 
@@ -246,7 +245,6 @@ def __init__(self, config: ASTConfig) -> None:
         self.intermediate_act_fn = config.hidden_act
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)
 
diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py
index 2558311a44bd78..2a04657f419909 100644
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -139,7 +139,7 @@ def __init__(
         is_encoder_decoder=True,
         decoder_start_token_id=2,
         forced_eos_token_id=2,
-        **kwargs
+        **kwargs,
     ):
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 136a5eebc40090..2aac055f85f0bf 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -1084,7 +1083,6 @@ def forward(
             past_key_value = past_key_values[idx] if past_key_values is not None else None
 
             if self.gradient_checkpointing and self.training:
-
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -1109,7 +1108,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1212,7 +1210,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, Bart automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1424,7 +1421,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past_key_values is used if past_key_values is not None: diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 01b2bf8ecec144..ac292cc77707db 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -916,7 +915,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1472,7 +1471,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -1748,7 +1747,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): config.is_decoder = True config.is_encoder_decoder = False @@ -1920,7 +1919,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 44b230be0eb9b0..6e29434c4df158 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -803,7 +803,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1109,9 +1108,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - # different to other models, Bart automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1185,7 +1183,6 @@ def call( BART_START_DOCSTRING, ) class TFBartModel(TFBartPretrainedModel): - _requires_load_weight_prefix = True def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): @@ -1225,9 +1222,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = 
None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1442,9 +1438,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index a6c5fdc6811549..5dc93578109f8f 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -192,7 +192,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 26c546be001a24..6d6e29986be4fa 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -166,7 +166,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index df1a006d3154e8..da0f2a93bdce86 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -41,7 +41,6 @@ if TYPE_CHECKING: - try: if not is_sentencepiece_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 2e58db113e151e..77ab8a9d64166b 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -134,7 +134,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index a7f36e007c146f..f53a5acd712c71 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -127,7 +127,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index b12a962eae3e23..1c1ef0b8675b8a 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -132,7 +132,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index c44c59942f03bc..ef7bf22b9189cf 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -147,7 +147,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -186,7 +186,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class BeitOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py index 12da57ea386c5e..f80d52db7d8893 100644 --- a/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py +++ b/src/transformers/models/beit/convert_beit_unilm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path +import requests import torch from datasets import load_dataset +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( BeitConfig, BeitFeatureExtractor, diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 4fa8b0aab0e890..6f73679a71baf6 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -105,7 +105,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_reduce_labels: bool = False, - **kwargs + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -157,7 +157,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). @@ -184,7 +184,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -206,7 +206,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
@@ -227,7 +227,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index 803f602f92a2f7..4caf0f478fb6db 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -149,7 +149,6 @@ def __init__(self, config: BeitConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py index 8e9f87b29448ed..02fb2e5e338dfa 100644 --- a/src/transformers/models/beit/modeling_flax_beit.py +++ b/src/transformers/models/beit/modeling_flax_beit.py @@ -16,12 +16,11 @@ from typing import Callable, List, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -166,7 +165,6 @@ def __call__(self, inputs, deterministic: Optional[bool] = True): class FlaxBeitPatchEmbeddings(nn.Module): - config: BeitConfig dtype: jnp.dtype = jnp.float32 # the dtype of the computation @@ -217,7 +215,6 @@ def setup(self): self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) def __call__(self, pixel_values, bool_masked_pos=None, deterministic=True): - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.shape @@ -518,7 +515,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -605,7 +601,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) if input_shape is None: @@ -713,7 +709,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - hidden_states = self.embeddings(pixel_values, bool_masked_pos, deterministic=deterministic) outputs = self.encoder( diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index b2d64b7fde67da..589c2b0261854b 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -156,7 +156,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py index a58240c8c3c2f7..68ed9bafc873ac 100644 --- a/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py +++ b/src/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -26,7 +26,6 @@ def convert_pytorch_checkpoint_to_tf(model: BertModel, 
ckpt_dir: str, model_name: str): - """ Args: model: BertModel Pytorch model instance to be converted diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 61355216653cb0..eb0e0d21665034 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -584,7 +584,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1158,7 +1157,6 @@ def forward( """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] @@ -1299,7 +1297,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] @@ -1713,7 +1710,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1799,7 +1795,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index 818a3ee0896452..6e8eb829b90903 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -775,7 +774,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class( config=config, @@ -1059,7 +1058,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 834b6237d36e4a..5391d71a916c3b 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -759,7 +759,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index d398fc5154aec3..8d13bb4e546c22 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -194,7 +194,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, 
strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -242,7 +242,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py index b057f7e4ce797c..e55f3f36ad6dd3 100644 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ b/src/transformers/models/bert/tokenization_bert_fast.py @@ -216,7 +216,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index e7ef0b411d2ec0..31171cdab112e5 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -2,7 +2,6 @@ from typing import List, Union import tensorflow as tf - from tensorflow_text import BertTokenizer as BertTokenizerLayer from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index d602de22f044fe..f0cb795d93615f 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -100,7 +100,7 @@ def __init__( eos_token_id=1, position_embedding_type="absolute", use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 54044195e961ad..928cd4433e1ef5 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -394,7 +394,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -464,7 +463,6 @@ def load_tf_weights_in_bert_generation( try: import numpy as np import tensorflow.compat.v1 as tf - import tensorflow_hub as hub import tensorflow_text # noqa: F401 diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 711dcdf50c2589..6ef3321277f365 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -92,7 +92,7 @@ def __init__( pad_token="", sep_token="<::::>", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 27d66ae9a990c7..5af9984bb9e9b0 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -158,7 +158,7 @@ def __init__( mecab_kwargs=None, sudachi_kwargs=None, jumanpp_kwargs=None, - **kwargs + **kwargs, ): super().__init__( spm_file=spm_file, diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 50de2db480fa30..837fea136743d2 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -132,7 +132,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): super().__init__( normalization=normalization, diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py index d9bcbfef081b53..53bf1ee6f44b75 100644 --- a/src/transformers/models/big_bird/configuration_big_bird.py +++ b/src/transformers/models/big_bird/configuration_big_bird.py @@ -131,7 +131,7 @@ def __init__( block_size=64, num_random_blocks=3, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py index 614443d81a5e79..34db9771b1e734 100644 --- a/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py @@ -43,7 +43,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, p if __name__ == "__main__": - parser = argparse.ArgumentParser() # Required parameters parser.add_argument( diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index ff5eeac628d4c2..cdb9e787b791e3 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -534,7 +534,6 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): - # BigBird block-sparse attention as suggested in paper # ITC: @@ -1606,7 +1605,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1632,7 +1630,6 @@ def custom_forward(*inputs): blocked_encoder_mask, ) else: - layer_outputs = layer_module( hidden_states, attention_mask, @@ -2178,7 +2175,6 @@ def forward( @staticmethod def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): - batch_size, seq_length = attention_mask.size() if seq_length % block_size != 0: raise ValueError( diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 2dfa871b118191..2c3806c754e9cd 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -481,7 +480,6 @@ def __call__( @staticmethod def create_masks_for_block_sparse_attn(attention_mask, block_size: int): - batch_size, seq_length = attention_mask.shape if seq_length % block_size != 0: raise ValueError( @@ -1578,7 +1576,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) if config.attention_type == "block_sparse" and input_shape is None: @@ -1862,7 +1860,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, @@ -2181,7 +2178,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if config.attention_type == "block_sparse" and input_shape is None: input_shape = (1, 1, 12 * config.block_size) @@ -2328,7 +2325,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 47c00fa7c2faf9..bd6f90ef027acd 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -113,7 +113,7 @@ def __init__( mask_token="[MASK]", cls_token="[CLS]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -206,7 +206,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py index 6ff063e772e2da..11c3386794701d 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py +++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -124,7 +124,7 @@ def __init__( sep_token="[SEP]", mask_token="[MASK]", cls_token="[CLS]", - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, 
rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py index ed9c0a42e07699..a7f198a735b385 100644 --- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -158,7 +158,7 @@ def __init__( block_size=64, num_random_blocks=3, use_bias=False, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py index 2d2efdec77418e..5a81207548f9a2 100644 --- a/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py +++ b/src/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py @@ -87,7 +87,6 @@ def rename_state_dict_key(k, patterns): def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: - cfg = BigBirdPegasusConfig(**config_update) torch_model = BigBirdPegasusForConditionalGeneration(cfg) state_dict = torch_model.state_dict() diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index ee784c9e8a6f4e..21a4c7ade40b7a 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -352,7 +352,6 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): - # BigBirdPegasus block-sparse attention as suggested in paper # ITC: @@ -1998,7 +1997,6 @@ def set_attention_type(self, value: str): @staticmethod # Copied from transformers.models.big_bird.modeling_big_bird.BigBirdModel.create_masks_for_block_sparse_attn def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): - batch_size, seq_length = attention_mask.size() if seq_length % block_size != 0: raise ValueError( @@ -2266,7 +2264,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -2291,7 +2288,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -2397,7 +2393,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, BigBirdPegasus automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -2611,7 +2606,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past_key_values is used if past_key_values is not None: diff --git a/src/transformers/models/biogpt/configuration_biogpt.py b/src/transformers/models/biogpt/configuration_biogpt.py index 56ced88dc72d88..2fe46354d291e8 100644 --- a/src/transformers/models/biogpt/configuration_biogpt.py +++ b/src/transformers/models/biogpt/configuration_biogpt.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py index bcbda452a3258d..c930a850462c82 100755 --- a/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py @@ -157,7 +157,6 @@ def rewrite_dict_keys(d): def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path): - # prep if not os.path.exists(biogpt_checkpoint_path): raise ValueError(f"path {biogpt_checkpoint_path} does not exist!") diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 716db23c68ac42..3fd9c823f968d3 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -556,7 +556,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -578,7 +577,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -704,7 +702,6 @@ def forward( ) def prepare_inputs_for_generation(self, input_ids, attention_mask, past_key_values=None, **kwargs): - # only last token for inputs_ids if past is defined in kwargs if past_key_values: input_ids = input_ids[:, -1].unsqueeze(-1) diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py index 405e4c8625133a..55f337f2ec9132 100644 --- a/src/transformers/models/biogpt/tokenization_biogpt.py +++ b/src/transformers/models/biogpt/tokenization_biogpt.py @@ -110,7 +110,7 @@ def __init__( eos_token="", sep_token="", pad_token="", - **kwargs + **kwargs, ): super().__init__( bos_token=bos_token, diff --git a/src/transformers/models/bit/configuration_bit.py b/src/transformers/models/bit/configuration_bit.py index 7c1e105107e34c..278c7f1c7f1a5a 100644 --- a/src/transformers/models/bit/configuration_bit.py +++ b/src/transformers/models/bit/configuration_bit.py @@ -98,7 +98,7 @@ def __init__( output_stride=32, width_factor=1, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: diff --git a/src/transformers/models/bit/convert_bit_to_pytorch.py b/src/transformers/models/bit/convert_bit_to_pytorch.py index 106c67d17e5e23..7cc7f64107ce9e 100644 --- a/src/transformers/models/bit/convert_bit_to_pytorch.py +++ b/src/transformers/models/bit/convert_bit_to_pytorch.py @@ -19,14 +19,14 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests +import torch from huggingface_hub import hf_hub_download +from PIL import Image from timm import create_model from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform + from transformers import BitConfig, BitForImageClassification, BitImageProcessor from transformers.image_utils import PILImageResampling from transformers.utils import logging diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 4395d0558455e9..374f57af4cd481 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
@@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py index ef18a3cec2e537..93ee9281364526 100644 --- a/src/transformers/models/blenderbot/configuration_blenderbot.py +++ b/src/transformers/models/blenderbot/configuration_blenderbot.py @@ -135,7 +135,7 @@ def __init__( eos_token_id=2, encoder_no_repeat_ngram_size=3, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 1f2b3a878d836c..22c647d67c8a56 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1015,7 +1015,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1040,7 +1039,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1384,7 +1382,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py index baadfd973e69fd..629ddb99a80c5c 100644 --- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -889,7 +888,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1448,7 +1447,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py index 23693b24bd6941..6b95bd56739adc 100644 --- 
a/src/transformers/models/blenderbot/modeling_tf_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -785,7 +785,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1101,7 +1100,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1219,7 +1218,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: outputs = self.model( input_ids=input_ids, @@ -1456,9 +1455,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index ace4afc6d503c6..208ced46bc2db8 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -185,7 +185,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 4e65294cb9041d..7c4e060e5d2035 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -148,7 +148,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py index c056fa46bd1f07..fbc23435d66f31 100644 --- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -134,7 +134,7 @@ def __init__( bos_token_id=1, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 633675dc56720d..0f5e5a46c95397 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1011,7 +1011,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1036,7 +1035,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1350,7 +1348,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py index 78947481faa5f3..226e401c921ea5 100644 --- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py @@ -20,11 +20,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -887,7 +886,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1446,7 +1445,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py index 0ef4e1e8beef5c..3d521ea77a4d67 100644 --- a/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -792,7 +792,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1109,9 +1108,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) @@ -1212,9 +1210,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1435,9 +1432,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 6ef94e636aebb0..a0b45bff1dc78c 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -104,7 +104,7 @@ def __init__( eos_token="__end__", 
unk_token="__unk__", pad_token="__null__", - **kwargs + **kwargs, ): super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index 8dfae5894fa6c3..adc350f3d11132 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -72,7 +72,7 @@ def __init__( eos_token="<|endoftext|>", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( ByteLevelBPETokenizer( diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 04d88570e82a72..8bdff88bff2fe8 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -133,7 +133,7 @@ def __init__( sep_token_id=102, is_decoder=True, use_cache=True, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, @@ -161,7 +161,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from BlipConfig @@ -242,7 +241,7 @@ def __init__( layer_norm_eps=1e-5, attention_dropout=0.0, initializer_range=1e-10, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -261,7 +260,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from BlipConfig @@ -334,7 +332,7 @@ def __init__( projection_dim=512, logit_scale_init_value=2.6592, image_text_hidden_size=256, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py index 9deda9c11609fa..7609b4a40e857f 100644 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py @@ -16,17 +16,17 @@ import argparse import re -import torch -from PIL import Image -from torchvision import transforms -from torchvision.transforms.functional import InterpolationMode - import requests +import torch # git clone https://github.com/salesforce/BLIP.git from models.blip import blip_decoder from models.blip_itm import blip_itm from models.blip_vqa import blip_vqa +from PIL import Image +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + from transformers import ( BertTokenizer, BlipConfig, diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py index e9b49924ec93ec..1b7e4c6b1a6a9f 100644 --- a/src/transformers/models/blip/image_processing_blip.py +++ b/src/transformers/models/blip/image_processing_blip.py @@ -91,9 +91,8 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: - super().__init__(**kwargs) size = size if size is not 
None else {"height": 384, "width": 384} size = get_size_dict(size, default_to_square=True) @@ -114,7 +113,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -142,7 +141,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -163,7 +162,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 96e3f5f4e1ad59..7f1b3412b6845e 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -1036,7 +1036,7 @@ def generate( pixel_values: torch.FloatTensor, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs + **generate_kwargs, ) -> torch.LongTensor: r""" Overrides *generate* function to be able to use the model as a conditional generator @@ -1263,7 +1263,7 @@ def generate( input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs + **generate_kwargs, ) -> torch.LongTensor: r""" Overrides *generate* function to be able to use the model as a conditional generator diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 7085fc2f11e147..c44cf3b0dfebeb 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -409,7 +409,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warn( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -802,7 +801,6 @@ def forward( # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 class BlipTextLMHeadModel(BlipTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index e860f6723a2611..5a9967913e4486 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -63,7 +63,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -76,7 +76,6 @@ def __call__( # Get only text if images is None: - self.current_processor = self.tokenizer text_encoding = self.tokenizer( text=text, diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 7eb4eb5322260b..74443d3e6eec3a 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -686,7 +686,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` @@ -752,12 +752,10 @@ def forward( ) for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -843,7 +841,7 @@ def prepare_inputs_for_generation( past_key_values: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - **kwargs + **kwargs, ) -> dict: # only last token for input_ids if past is not None if past_key_values: @@ -886,7 +884,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1016,7 +1014,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1152,7 +1150,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **deprecated_arguments + **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index 1d6f405039a80b..1c8efb10cb6c46 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -113,7 +113,7 @@ def __init__( eos_token="", pad_token="", add_prefix_space=False, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py index 933c88795ab94a..4753f593da19b2 100644 --- a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py @@ -18,17 +18,17 @@ import argparse import os -import numpy as np -import torch -from packaging import version -from torch import nn - import gluonnlp as nlp import mxnet as mx +import numpy as np +import torch from gluonnlp.base import get_home_dir from gluonnlp.model.bert import BERTEncoder from gluonnlp.model.utils import _load_vocab from gluonnlp.vocab import Vocab +from packaging import version +from torch import nn + from transformers import BertConfig, BertForMaskedLM, BertModel, RobertaTokenizer from transformers.models.bert.modeling_bert import ( BertIntermediate, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index a88767a2172849..c04b01a2ea713c 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -88,7 +88,7 @@ def __init__( stop_gradient=False, share_layernorm=True, remove_last_layer=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size @@ -207,7 +207,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -317,7 +317,7 @@ def __init__( init_layernorm_from_vision_encoder=False, text_config=None, 
vision_config=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 8e8be8373c7ae4..fdea55396ba446 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -170,7 +170,7 @@ def __init__( image_std: Optional[Union[float, List[float]]] = None, do_center_crop: bool = True, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -199,7 +199,7 @@ def resize( size_divisor: int = 32, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -234,7 +234,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -254,7 +254,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -278,7 +278,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index f520a221f84a51..aa25ad52d7edf8 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -768,7 +768,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index a92932c041e0ba..c268d7c26f43d9 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -63,7 +63,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 0071d7a9afe4ed..59e694c343c559 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -67,7 +67,7 @@ def __init__( pad_token="<pad>", extra_ids=125, additional_special_tokens=None, - **kwargs + **kwargs, ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py index 09989f1cb85f4e..d712726492ae18 100644 --- a/src/transformers/models/camembert/configuration_camembert.py +++ b/src/transformers/models/camembert/configuration_camembert.py @@ -126,7 +126,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index e7755dd0305324..2a7b4ecbfac16f 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -515,7 +515,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/camembert/modeling_tf_camembert.py b/src/transformers/models/camembert/modeling_tf_camembert.py index 29f435bdb194b2..5142b3d82b04cb 100644 --- a/src/transformers/models/camembert/modeling_tf_camembert.py +++ b/src/transformers/models/camembert/modeling_tf_camembert.py @@ -729,7 +729,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index f5988fd9d784ec..658dd1080b7122 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -129,7 +129,7 @@ def __init__( mask_token="<mask>", additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e.
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index cbffa75a28b744..8a5ebbedd1c7be 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -120,7 +120,7 @@ def __init__( pad_token="<pad>", mask_token="<mask>", additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"], - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py index b75ab9cc42b9a0..1fdeb3204a52e4 100644 --- a/src/transformers/models/canine/configuration_canine.py +++ b/src/transformers/models/canine/configuration_canine.py @@ -112,7 +112,7 @@ def __init__( num_hash_functions=8, num_hash_buckets=16384, local_transformer_stride=128, # Good TPU/XLA memory alignment. - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py index 15b7b6c32ae515..5d50050d039687 100644 --- a/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py @@ -25,7 +25,6 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): - # Initialize PyTorch model config = CanineConfig() model = CanineModel(config) diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 9cabe705b4486a..a91d42f0395ee8 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -312,7 +312,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) def forward(self, char_encoding: torch.Tensor) -> torch.Tensor: - # `cls_encoding`: [batch, 1, hidden_size] cls_encoding = char_encoding[:, 0:1, :] diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py index fba01e03c0e871..2fae9e1482bd32 100644 --- a/src/transformers/models/canine/tokenization_canine.py +++ b/src/transformers/models/canine/tokenization_canine.py @@ -86,7 +86,7 @@ def __init__( mask_token=chr(MASK), add_prefix_space=False, model_max_length=2048, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 9f581a087a0d38..f20e16e41cac64 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -122,7 +122,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, -
**kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -144,7 +144,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from ChineseCLIPConfig @@ -228,7 +227,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -248,7 +247,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from ChineseCLIPConfig @@ -402,7 +400,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py index 6016c51b376ca8..02c4b7b754b295 100644 --- a/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/chinese_clip/convert_chinese_clip_original_pytorch_to_hf.py @@ -21,7 +21,6 @@ def copy_attn_layer(hf_attn_layer, pt_weights, prefix): - q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0) q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0) diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index 848bc086a1f086..18a80e21af6cfc 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -177,7 +177,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -198,7 +198,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
@@ -231,7 +231,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 527a8a09113cc5..ce6a283a05b3ed 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -900,7 +900,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 84dec5d71a58c7..1295245993110b 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -108,7 +108,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPConfig @@ -211,7 +210,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -231,7 +230,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from CLIPConfig @@ -383,7 +381,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py index 58886aa88a3440..0033be274d5c13 100644 --- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py @@ -16,8 +16,8 @@ import argparse import torch - from clip import load + from transformers import CLIPConfig, CLIPModel diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py index ac99feb54dd9ae..325f1c3991817d 100644 --- a/src/transformers/models/clip/image_processing_clip.py +++ b/src/transformers/models/clip/image_processing_clip.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = 
PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py index ea4ff88a2c6b23..cb8ee4e7c9a444 100644 --- a/src/transformers/models/clip/modeling_flax_clip.py +++ b/src/transformers/models/clip/modeling_flax_clip.py @@ -593,7 +593,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -673,7 +673,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = (1, config.image_size, config.image_size, 3) @@ -744,7 +744,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, config.vision_config.image_size, config.vision_config.image_size, 3)) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index 9ea7e7690428a8..680ea91ca1383a 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -151,7 +151,6 @@ def __init__(self, config: CLIPVisionConfig, **kwargs): ) def build(self, input_shape: tf.TensorShape): - factor = self.config.initializer_factor self.class_embedding = self.add_weight( @@ -205,7 +204,6 @@ def __init__(self, config: CLIPTextConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - with tf.name_scope("token_embedding"): self.weight = self.add_weight( shape=(self.config.vocab_size, self.embed_dim), @@ -381,7 +379,6 @@ def __init__(self, config: CLIPConfig, **kwargs): ) def call(self, hidden_states: tf.Tensor) -> tf.Tensor: - hidden_states = self.fc1(inputs=hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(inputs=hidden_states) @@ -644,7 +641,6 @@ def call( return_dict: bool, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - embedding_output = self.embeddings(pixel_values=pixel_values) embedding_output 
= self.pre_layernorm(inputs=embedding_output) @@ -694,7 +690,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -753,7 +748,6 @@ def __init__(self, config: CLIPConfig, **kwargs): ) def build(self, input_shape: tf.TensorShape): - self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -774,7 +768,6 @@ def get_text_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -836,7 +829,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFCLIPOutput, Tuple[tf.Tensor]]: - if input_ids is None: raise ValueError("You have to specify either input_ids") if pixel_values is None: diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index ef8da45cda2afd..e3ff5f8626fa6f 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -295,7 +295,7 @@ def __init__( bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py index df11bf793f0871..75b3e4f4078053 100644 --- a/src/transformers/models/clip/tokenization_clip_fast.py +++ b/src/transformers/models/clip/tokenization_clip_fast.py @@ -84,7 +84,7 @@ def __init__( bos_token="<|startoftext|>", eos_token="<|endoftext|>", pad_token="<|endoftext|>", # hack to enable padding - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 33a9330e1f4f48..1910c946325ae4 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -99,7 +99,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -117,7 +117,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPSegConfig @@ -200,7 +199,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -219,7 +218,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from CLIPSegConfig @@ -316,7 +314,7 @@ def __init__( decoder_intermediate_size=2048, conditional_layer=0, use_complex_transposed_convolution=False, - **kwargs + **kwargs, ): 
super().__init__(**kwargs) diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 778dbca2996782..183bb93b9e2b75 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -17,10 +17,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import ( CLIPSegConfig, CLIPSegForImageSegmentation, diff --git a/src/transformers/models/codegen/configuration_codegen.py b/src/transformers/models/codegen/configuration_codegen.py index aed1249f86b0b1..1a1e609f0111fb 100644 --- a/src/transformers/models/codegen/configuration_codegen.py +++ b/src/transformers/models/codegen/configuration_codegen.py @@ -124,7 +124,7 @@ def __init__( bos_token_id=50256, eos_token_id=50256, tie_word_embeddings=False, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_ctx = n_ctx diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 92a7d9506cd9a2..87a2a986c82055 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -147,7 +147,6 @@ def _attn( attention_mask=None, head_mask=None, ): - # compute causal mask from causal mask buffer query_length, key_length = query.size(-2), key.size(-2) causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length] @@ -193,7 +192,6 @@ def forward( Tuple[torch.Tensor, Tuple[torch.Tensor]], Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], ]: - qkv = self.qkv_proj(hidden_states) # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic mp_num = 4 @@ -545,12 +543,10 @@ def forward( all_self_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. 
Setting " diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index ff86eee8231ab6..c09a816bfbab5c 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np - import regex as re from ...utils import is_tf_available, is_torch_available, logging @@ -160,7 +159,7 @@ def __init__( pad_token=None, add_prefix_space=False, add_bos_token=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -321,7 +320,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, truncate_before_pattern: Optional[List[str]] = None, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index 1c3dcf85fb5ea1..332f0ed934acad 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -126,7 +126,7 @@ def __init__( bos_token="<|endoftext|>", eos_token="<|endoftext|>", add_prefix_space=False, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -187,7 +187,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, truncate_before_pattern: Optional[List[str]] = None, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/conditional_detr/configuration_conditional_detr.py b/src/transformers/models/conditional_detr/configuration_conditional_detr.py index 08fd5f08357153..ec04f0a52369fa 100644 --- a/src/transformers/models/conditional_detr/configuration_conditional_detr.py +++ b/src/transformers/models/conditional_detr/configuration_conditional_detr.py @@ -178,7 +178,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -240,7 +240,6 @@ def hidden_size(self) -> int: class ConditionalDetrOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py index a4e28cbb558a30..083b3c681ec22d 100644 --- a/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/conditional_detr/convert_conditional_detr_original_pytorch_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( ConditionalDetrConfig, ConditionalDetrFeatureExtractor, diff 
--git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py index 4f161bbb6ec58b..d4e2f9dd5f3ac2 100644 --- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py +++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py @@ -788,7 +788,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -901,7 +901,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1089,7 +1089,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index bf3fb9994727d0..cbee21aafe3a52 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -133,7 +133,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -181,7 +181,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py index 65c37a9b0927e8..07447bb6a17caa 100644 --- a/src/transformers/models/convbert/tokenization_convbert_fast.py +++ b/src/transformers/models/convbert/tokenization_convbert_fast.py @@ -110,7 +110,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/convnext/configuration_convnext.py b/src/transformers/models/convnext/configuration_convnext.py index 41562cbcd44efa..d4807bc5741a02 100644 --- a/src/transformers/models/convnext/configuration_convnext.py +++ b/src/transformers/models/convnext/configuration_convnext.py @@ -97,7 +97,7 @@ def __init__( drop_path_rate=0.0, image_size=224, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -125,7 +125,6 @@ def __init__( class ConvNextOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/convnext/convert_convnext_to_pytorch.py b/src/transformers/models/convnext/convert_convnext_to_pytorch.py index e40565c7a691cd..69c300eee40406 100644 --- a/src/transformers/models/convnext/convert_convnext_to_pytorch.py +++ b/src/transformers/models/convnext/convert_convnext_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests 
import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ConvNextConfig, ConvNextFeatureExtractor, ConvNextForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/convnext/image_processing_convnext.py b/src/transformers/models/convnext/image_processing_convnext.py index 2353767df53ab4..a46bdcfef75b77 100644 --- a/src/transformers/models/convnext/image_processing_convnext.py +++ b/src/transformers/models/convnext/image_processing_convnext.py @@ -99,7 +99,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 384} @@ -123,7 +123,7 @@ def resize( crop_pct: float, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -166,7 +166,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -187,7 +187,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py index bf2ec8f7c4511f..f509519271d4e8 100644 --- a/src/transformers/models/cpm/tokenization_cpm.py +++ b/src/transformers/models/cpm/tokenization_cpm.py @@ -53,7 +53,7 @@ def __init__( mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: """ Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py index 032aebcf5b1ede..31a2aaa9f1d848 100644 --- a/src/transformers/models/cpm/tokenization_cpm_fast.py +++ b/src/transformers/models/cpm/tokenization_cpm_fast.py @@ -53,7 +53,7 @@ def __init__( cls_token="<cls>", mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], - **kwargs + **kwargs, ): """ Construct a CPM tokenizer.
Based on [Jieba](https://pypi.org/project/jieba/) and diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py index d207c3fa4b2fa2..0a1feed58b24db 100644 --- a/src/transformers/models/ctrl/configuration_ctrl.py +++ b/src/transformers/models/ctrl/configuration_ctrl.py @@ -97,7 +97,7 @@ def __init__( layer_norm_epsilon=1e-6, initializer_range=0.02, use_cache=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/src/transformers/models/ctrl/modeling_tf_ctrl.py b/src/transformers/models/ctrl/modeling_tf_ctrl.py index 116bb4ca665bdd..dcd3f5a03e0c92 100644 --- a/src/transformers/models/ctrl/modeling_tf_ctrl.py +++ b/src/transformers/models/ctrl/modeling_tf_ctrl.py @@ -269,7 +269,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutputWithPast]: - # If using past key value states, only the last tokens # should be given as an input if past_key_values is not None: diff --git a/src/transformers/models/cvt/configuration_cvt.py b/src/transformers/models/cvt/configuration_cvt.py index 0ab32857d49608..a540c0f4807cca 100644 --- a/src/transformers/models/cvt/configuration_cvt.py +++ b/src/transformers/models/cvt/configuration_cvt.py @@ -121,7 +121,7 @@ def __init__( stride_q=[1, 1, 1], initializer_range=0.02, layer_norm_eps=1e-12, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.num_channels = num_channels diff --git a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py index 1c9f58f4a68d5d..e84c61d6aad644 100644 --- a/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/cvt/convert_cvt_original_pytorch_checkpoint_to_pytorch.py @@ -22,8 +22,8 @@ from collections import OrderedDict import torch - from huggingface_hub import cached_download, hf_hub_url + from transformers import AutoFeatureExtractor, CvtConfig, CvtForImageClassification diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 220e40396c9b8b..99e3a02febf4d2 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -212,7 +212,7 @@ def __init__( qkv_bias, attention_drop_rate, with_cls_token=True, - **kwargs + **kwargs, ): super().__init__() self.scale = embed_dim**-0.5 @@ -616,7 +616,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithCLSToken]: - output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) diff --git a/src/transformers/models/cvt/modeling_tf_cvt.py b/src/transformers/models/cvt/modeling_tf_cvt.py index c82cade5247982..52cc6585a7a552 100644 --- a/src/transformers/models/cvt/modeling_tf_cvt.py +++ b/src/transformers/models/cvt/modeling_tf_cvt.py @@ -109,7 +109,7 @@ def __init__( stride: int, padding: int, dropout_rate: float, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.convolution_embeddings = TFCvtConvEmbeddings( @@ -211,7 +211,7 @@ def __init__( stride: int, padding: int, projection_method: str = "dw_bn", - **kwargs + **kwargs, ): super().__init__(**kwargs) if projection_method == "dw_bn": @@ -246,7 +246,7 @@ def __init__( qkv_bias: bool, attention_drop_rate: float, with_cls_token: bool = True, - **kwargs + 
**kwargs, ): super().__init__(**kwargs) self.scale = embed_dim**-0.5 @@ -470,7 +470,7 @@ def __init__( mlp_ratio: float, drop_path_rate: float, with_cls_token: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.attention = TFCvtAttention( diff --git a/src/transformers/models/data2vec/configuration_data2vec_audio.py b/src/transformers/models/data2vec/configuration_data2vec_audio.py index c1aee468321482..2ec526924f36eb 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_audio.py +++ b/src/transformers/models/data2vec/configuration_data2vec_audio.py @@ -211,7 +211,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index a990e933bc6762..305a3ea5e4ffa4 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -117,7 +117,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/data2vec/configuration_data2vec_vision.py b/src/transformers/models/data2vec/configuration_data2vec_vision.py index a63c9429c3382f..b45f8420ca0008 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_vision.py +++ b/src/transformers/models/data2vec/configuration_data2vec_vision.py @@ -142,7 +142,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -180,7 +180,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class Data2VecVisionOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py index 7777e85927cdaa..8ff8c1b910f0b4 100755 --- a/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/data2vec/convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py @@ -3,10 +3,10 @@ import json import torch -from PIL import Image - from huggingface_hub import hf_hub_download +from PIL import Image from timm.models import create_model + from transformers import ( BeitFeatureExtractor, Data2VecVisionConfig, diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 8ad1c9f0e9fb77..6cda1b869bca68 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1049,7 +1049,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index b3c5cba4f226c1..737209142cea40 100644 --- 
a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -501,7 +501,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index c1b85c7f093265..42a1edcb6493ca 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -150,7 +150,6 @@ def __init__(self, config: Data2VecVisionConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py index cfb81ceb396e05..c837670b1a1f69 100644 --- a/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_tf_data2vec_vision.py @@ -165,7 +165,6 @@ def build(self, input_shape: tf.TensorShape): super().build(input_shape) def call(self, pixel_values: tf.Tensor, bool_masked_pos: Optional[tf.Tensor] = None) -> tf.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, projection_dim = shape_list(embeddings) @@ -909,7 +908,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[tuple, TFData2VecVisionModelOutputWithPooling]: - outputs = self.data2vec_vision( pixel_values=pixel_values, bool_masked_pos=bool_masked_pos, @@ -1027,7 +1025,7 @@ def __init__( padding: str = "valid", bias: bool = False, dilation: Union[int, Tuple[int, int]] = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.conv = tf.keras.layers.Conv2D( @@ -1261,7 +1259,7 @@ def __init__( in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.in_channels = config.hidden_size diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py index ec00f0eccb2d65..94ea91cd3a0888 100644 --- a/src/transformers/models/deberta/configuration_deberta.py +++ b/src/transformers/models/deberta/configuration_deberta.py @@ -128,7 +128,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index ecfd0c53e6a6d5..2b63b428bbe9b2 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -454,7 +454,6 @@ def forward( next_kv = hidden_states rel_embeddings = self.get_rel_embedding() for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py index c86b900042c059..016ce15db61825 100644 --- 
a/src/transformers/models/deberta/modeling_tf_deberta.py +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -92,7 +92,6 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): - rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -352,7 +351,6 @@ def call( rel_embeddings = self.get_rel_embedding() for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -659,7 +657,6 @@ def linear(w, b, x): return outputs def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): - if relative_pos is None: q = shape_list(query_layer)[-2] relative_pos = build_relative_position(q, shape_list(key_layer)[-2]) @@ -944,7 +941,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py index bbddb00a2682c9..bcaaaa4421178f 100644 --- a/src/transformers/models/deberta/tokenization_deberta.py +++ b/src/transformers/models/deberta/tokenization_deberta.py @@ -191,7 +191,7 @@ def __init__( mask_token="[MASK]", add_prefix_space=False, add_bos_token=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index f708de1636216f..959bcae470112a 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -154,9 +154,8 @@ def __init__( pad_token="[PAD]", mask_token="[MASK]", add_prefix_space=False, - **kwargs + **kwargs, ): - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py index 3e7d0d97fe6f05..d55486cd563381 100644 --- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py +++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py @@ -130,7 +130,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 7d73a49ef9281f..ef04a24b2f63e1 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -498,7 +498,6 @@ def forward( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index 885212be7389be..015eb392574087 100644 --- 
a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -93,7 +93,6 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): - rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -416,7 +415,6 @@ def call( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) @@ -713,7 +711,6 @@ def call( return outputs def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): - if relative_pos is None: q = shape_list(query_layer)[-2] relative_pos = build_relative_position( @@ -1036,7 +1033,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index fc259dd7d5eec9..b2a0d844f1625d 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -120,7 +120,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py index 32ccd84862fa86..3f2a90cfa83ea9 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py @@ -128,7 +128,7 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ) -> None: super().__init__( vocab_file, diff --git a/src/transformers/models/decision_transformer/configuration_decision_transformer.py b/src/transformers/models/decision_transformer/configuration_decision_transformer.py index 17c81c767c5c24..91ef58665a1f18 100644 --- a/src/transformers/models/decision_transformer/configuration_decision_transformer.py +++ b/src/transformers/models/decision_transformer/configuration_decision_transformer.py @@ -134,7 +134,6 @@ def __init__( reorder_and_upcast_attn=False, **kwargs, ): - self.state_dim = state_dim self.act_dim = act_dim self.hidden_size = hidden_size diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 5008c7ed7e9c5b..1f5f7601229d32 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -288,8 +288,8 @@ def forward( if encoder_hidden_states is not None: if not hasattr(self, "q_attn"): raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to" - " instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." 
+ "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`." ) query = self.q_attn(hidden_states) @@ -612,7 +612,6 @@ def forward( all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) @@ -628,7 +627,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 253d521aa5de7a..d00b71fab811d6 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -193,7 +193,7 @@ def __init__( giou_loss_coefficient=2, eos_coefficient=0.1, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") diff --git a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py index 8e4461d515c2ea..d1fd8bcbe46677 100644 --- a/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py +++ b/src/transformers/models/deformable_detr/convert_deformable_detr_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DeformableDetrConfig, DeformableDetrFeatureExtractor, DeformableDetrForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 5b23c05c085c9d..3601a2aad11f68 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -786,7 +786,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -899,7 +899,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1087,7 +1087,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. 
diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index c675e793211ce5..a7ee782501a2ab 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1172,7 +1172,6 @@ def get_reference_points(spatial_shapes, valid_ratios, device): """ reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py index 8fbba3e9be08d7..b395afdbef5cf3 100644 --- a/src/transformers/models/deit/configuration_deit.py +++ b/src/transformers/models/deit/configuration_deit.py @@ -109,7 +109,7 @@ def __init__( num_channels=3, qkv_bias=True, encoder_stride=16, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py index 8a8a394c3f8103..a91702106e0f8c 100644 --- a/src/transformers/models/deit/convert_deit_timm_to_pytorch.py +++ b/src/transformers/models/deit/convert_deit_timm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import DeiTConfig, DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher from transformers.utils import logging diff --git a/src/transformers/models/deit/image_processing_deit.py b/src/transformers/models/deit/image_processing_deit.py index 2c0ad59fa57f5b..77d7d2bb2ca2e5 100644 --- a/src/transformers/models/deit/image_processing_deit.py +++ b/src/transformers/models/deit/image_processing_deit.py @@ -91,7 +91,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 256, "width": 256} @@ -116,7 +116,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])` using the specified resampling filter. @@ -143,7 +143,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(crop_size["height"], crop_size["width"])`. If the input size is smaller than @@ -167,7 +167,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -188,7 +188,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index ca849f230e6c5e..f05b16efe7a045 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -204,7 +204,6 @@ def __init__(self, config: DeiTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -262,7 +261,6 @@ def __init__(self, config: DeiTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/deta/configuration_deta.py b/src/transformers/models/deta/configuration_deta.py index 14e2db1580b61d..06b4d3f892377e 100644 --- a/src/transformers/models/deta/configuration_deta.py +++ b/src/transformers/models/deta/configuration_deta.py @@ -174,7 +174,7 @@ def __init__( giou_loss_coefficient=2, eos_coefficient=0.1, focal_alpha=0.25, - **kwargs + **kwargs, ): if backbone_config is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") diff --git a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py index 84672301034982..cc17568bd64133 100644 --- a/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_resnet_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor from transformers.utils import logging diff --git a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py index 46ed720e24dafd..911bc434e14265 100644 --- a/src/transformers/models/deta/convert_deta_swin_to_pytorch.py +++ b/src/transformers/models/deta/convert_deta_swin_to_pytorch.py @@ -21,11 +21,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_download, hf_hub_url from transformers import DetaConfig, DetaForObjectDetection, DetaImageProcessor, SwinConfig from transformers.utils import logging diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index 1ef7700975229b..717fbfdd540a97 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -491,7 +491,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -568,7 +568,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> 
np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -746,7 +746,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 750bf114f5b6b0..eb77604fbfc4ee 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1069,7 +1069,6 @@ def get_reference_points(spatial_shapes, valid_ratios, device): """ reference_points_list = [] for level, (height, width) in enumerate(spatial_shapes): - ref_y, ref_x = meshgrid( torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index e730c833ff12fe..430efc913b37c3 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -174,7 +174,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): if backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -235,7 +235,6 @@ def hidden_size(self) -> int: class DetrOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py index a03642c316448a..b6dcc617da7b62 100644 --- a/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_original_pytorch_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import DetrConfig, DetrFeatureExtractor, DetrForObjectDetection, DetrForSegmentation from transformers.utils import logging diff --git a/src/transformers/models/detr/convert_detr_to_pytorch.py b/src/transformers/models/detr/convert_detr_to_pytorch.py index 0ef750f0e4fb2e..3ff2e38ac38325 100644 --- a/src/transformers/models/detr/convert_detr_to_pytorch.py +++ b/src/transformers/models/detr/convert_detr_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import DetrConfig, DetrForObjectDetection, DetrForSegmentation, DetrImageProcessor, ResNetConfig from transformers.utils import logging @@ -60,7 +60,6 @@ def get_detr_config(model_name): def create_rename_keys(config): - # here we list all keys to be renamed (original name on the left, our name on the right) rename_keys = [] diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 
4b24ad42c7ef3d..f03465954e8de4 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -770,7 +770,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -875,7 +875,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -1055,7 +1055,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model. diff --git a/src/transformers/models/dinat/configuration_dinat.py b/src/transformers/models/dinat/configuration_dinat.py index 8348d1beb9fe9f..e4ba8f940debe4 100644 --- a/src/transformers/models/dinat/configuration_dinat.py +++ b/src/transformers/models/dinat/configuration_dinat.py @@ -117,7 +117,7 @@ def __init__( layer_norm_eps=1e-5, layer_scale_init_value=0.0, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 7176fae899d0f2..ef19005834ac12 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -337,7 +337,6 @@ def forward( hidden_states: torch.Tensor, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - query_layer = self.transpose_for_scores(self.query(hidden_states)) key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py index b36917bc07580b..3dabb3d3e2340e 100644 --- a/src/transformers/models/distilbert/configuration_distilbert.py +++ b/src/transformers/models/distilbert/configuration_distilbert.py @@ -121,7 +121,7 @@ def __init__( qa_dropout=0.1, seq_classif_dropout=0.2, pad_token_id=0, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 214c5d916621ea..535c7372382dd7 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -329,7 +329,6 @@ def forward( output_hidden_states: bool = False, return_dict: Optional[bool] = None, ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]: # docstyle-ignore - """ Parameters: x: torch.tensor(bs, seq_length, dim) Input sequence embedded. 
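Another recurring change in the modeling-file hunks above (DeiT, Dinat, DistilBERT, and the gradient-checkpointing branches) is the deletion of a blank line that sat directly after a `def ...:` signature or an `if ...:` block opener. The formatting pass apparently strips these so the body starts immediately under the opener. A toy before/after sketch, assuming `torch` is available as it is elsewhere in the repo (the class is invented for illustration):

```python
import torch
from torch import nn


class ToyOutput(nn.Module):
    """Toy stand-in for the forward() methods cleaned up in the hunks above."""

    def __init__(self, hidden: int = 8, p: float = 0.1) -> None:
        super().__init__()
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(p)

    # Previously a blank line sat between the signature below and its first
    # statement; the cleanup removes it so the body follows the def directly.
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        return self.dropout(hidden_states)


if __name__ == "__main__":
    print(ToyOutput()(torch.zeros(2, 8)).shape)
```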
diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py index 984931bb4003ea..24e2c7e3987e07 100644 --- a/src/transformers/models/distilbert/modeling_flax_distilbert.py +++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -16,11 +16,10 @@ import math from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.traverse_util import flatten_dict, unflatten_dict from jax import lax @@ -200,7 +199,6 @@ def __call__( deterministic: bool = True, output_attentions: bool = False, ): - bs, q_len, dim = query.shape k_len = key.shape[1] # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured' @@ -429,7 +427,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index 612b96c83da38a..76582ae4eab1cd 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -147,7 +147,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -199,7 +199,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py index 67763ad36e943b..dd9dcd165d4109 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py +++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py @@ -140,7 +140,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py index 7503f035eacc28..d80e9d890ddc30 100644 --- a/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py +++ b/src/transformers/models/dit/convert_dit_unilm_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import BeitConfig, BeitFeatureExtractor, BeitForImageClassification, BeitForMaskedImageModeling from transformers.image_utils import PILImageResampling from transformers.utils import logging diff --git a/src/transformers/models/donut/configuration_donut_swin.py b/src/transformers/models/donut/configuration_donut_swin.py index d3316bdc79f685..02bd0f72ecb93c 100644 --- a/src/transformers/models/donut/configuration_donut_swin.py +++ b/src/transformers/models/donut/configuration_donut_swin.py @@ -113,7 +113,7 @@ def __init__( 
patch_norm=True, initializer_range=0.02, layer_norm_eps=1e-5, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/donut/convert_donut_to_pytorch.py b/src/transformers/models/donut/convert_donut_to_pytorch.py index 507f10cb776cf0..cd0f43773567f1 100644 --- a/src/transformers/models/donut/convert_donut_to_pytorch.py +++ b/src/transformers/models/donut/convert_donut_to_pytorch.py @@ -18,8 +18,8 @@ import torch from datasets import load_dataset - from donut import DonutModel + from transformers import ( DonutFeatureExtractor, DonutProcessor, diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 835fdb9f64a166..325a2bb9b60215 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -102,7 +102,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) @@ -207,7 +207,7 @@ def thumbnail( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any @@ -248,7 +248,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -275,7 +275,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -296,7 +296,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. @@ -330,7 +330,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 9c5ab57dc4e86b..bb9e863a36c02a 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -675,7 +675,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py index cfbf296994b739..5551883e09645e 100644 --- a/src/transformers/models/dpr/configuration_dpr.py +++ b/src/transformers/models/dpr/configuration_dpr.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", projection_dim: int = 0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index 471376322eda25..a551e507300b0d 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -170,7 +170,6 @@ def _set_gradient_checkpointing(self, module, value=False): class DPREncoder(DPRPreTrainedModel): - base_model_prefix = "bert_model" def __init__(self, config: DPRConfig): @@ -227,7 +226,6 @@ def embeddings_size(self) -> int: class DPRSpanPredictor(DPRPreTrainedModel): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig): diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py index 28aba892e9b8f6..565ad37b2117e8 100644 --- a/src/transformers/models/dpr/modeling_tf_dpr.py +++ b/src/transformers/models/dpr/modeling_tf_dpr.py @@ -146,7 +146,6 @@ class TFDPRReaderOutput(ModelOutput): class TFDPREncoderLayer(tf.keras.layers.Layer): - base_model_prefix = "bert_model" def __init__(self, config: DPRConfig, **kwargs): @@ -210,7 +209,6 @@ def embeddings_size(self) -> int: class TFDPRSpanPredictorLayer(tf.keras.layers.Layer): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig, **kwargs): @@ -275,7 +273,6 @@ def call( class TFDPRSpanPredictor(TFPreTrainedModel): - base_model_prefix = "encoder" def __init__(self, config: DPRConfig, **kwargs): diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index 7cd01a18fc06db..a14133459b7e83 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -230,7 +230,7 @@ def __call__( max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchEncoding: if titles is None and texts is None: return super().__call__( diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 280f856a174ba0..507cd2bc40bcea 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -231,7 +231,7 @@ def __call__( max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchEncoding: if titles is None and texts is None: return super().__call__( diff --git a/src/transformers/models/dpt/configuration_dpt.py 
b/src/transformers/models/dpt/configuration_dpt.py index 034f1b7e166ec2..7f2dd2e807b702 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -152,7 +152,7 @@ def __init__( backbone_featmap_shape=[1, 1024, 24, 24], neck_ignore_stages=[0, 1], backbone_config=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index bccc82bb2b3683..a563436b13c874 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation from transformers.utils import logging diff --git a/src/transformers/models/dpt/convert_dpt_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_to_pytorch.py index dc26d017d73644..7ef5f7cf119a1b 100644 --- a/src/transformers/models/dpt/convert_dpt_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import cached_download, hf_hub_url from PIL import Image -import requests -from huggingface_hub import cached_download, hf_hub_url from transformers import DPTConfig, DPTFeatureExtractor, DPTForDepthEstimation, DPTForSemanticSegmentation from transformers.utils import logging diff --git a/src/transformers/models/dpt/image_processing_dpt.py b/src/transformers/models/dpt/image_processing_dpt.py index 2bc57c9a2afe07..d6bcfe9c5e3d13 100644 --- a/src/transformers/models/dpt/image_processing_dpt.py +++ b/src/transformers/models/dpt/image_processing_dpt.py @@ -135,7 +135,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 384, "width": 384} @@ -159,7 +159,7 @@ def resize( ensure_multiple_of: int = 1, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image @@ -199,7 +199,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -220,7 +220,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 87f8d026891df3..187a6c36656a8e 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -379,7 +379,6 @@ def __init__(self, config: DPTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -438,7 +437,6 @@ def __init__(self, config: DPTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/efficientformer/configuration_efficientformer.py b/src/transformers/models/efficientformer/configuration_efficientformer.py index 6a2bc31eedb28e..5f30664ff325a0 100644 --- a/src/transformers/models/efficientformer/configuration_efficientformer.py +++ b/src/transformers/models/efficientformer/configuration_efficientformer.py @@ -136,7 +136,7 @@ def __init__( hidden_act: str = "gelu", initializer_range: float = 0.02, layer_norm_eps: float = 1e-12, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) diff --git a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py index 342f1263d3a811..6f7f1b60669f05 100644 --- a/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/efficientformer/convert_efficientformer_original_pytorch_checkpoint_to_pytorch.py @@ -22,11 +22,11 @@ import re from pathlib import Path +import requests import torch from PIL import Image from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor -import requests from transformers import ( EfficientFormerConfig, EfficientFormerForImageClassificationWithTeacher, diff --git a/src/transformers/models/efficientformer/image_processing_efficientformer.py b/src/transformers/models/efficientformer/image_processing_efficientformer.py index 963946f26688ef..5694fb166e3c76 100644 --- a/src/transformers/models/efficientformer/image_processing_efficientformer.py +++ b/src/transformers/models/efficientformer/image_processing_efficientformer.py @@ -96,7 +96,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -121,7 +121,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -158,7 +158,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. 
If the image is too small to be cropped to the size given, it will be padded (so the @@ -205,7 +205,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py index 01a6d1165a2e78..d8e1de0fc97fa4 100644 --- a/src/transformers/models/electra/configuration_electra.py +++ b/src/transformers/models/electra/configuration_electra.py @@ -155,7 +155,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 22114e28ef7360..8801b0de9a9de5 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -562,7 +562,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py index 093a6e69acf059..863ba86d778999 100644 --- a/src/transformers/models/electra/modeling_flax_electra.py +++ b/src/transformers/models/electra/modeling_flax_electra.py @@ -15,12 +15,11 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -692,7 +691,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -786,7 +785,6 @@ def __call__( return_dict: Optional[bool] = None, past_key_values: dict = None, ): - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index f81bb8f3bf9da8..673c1db6119e4c 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -150,7 +150,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -198,7 +198,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git 
a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py index 894f41df179225..cf92dd01714f9d 100644 --- a/src/transformers/models/electra/tokenization_electra_fast.py +++ b/src/transformers/models/electra/tokenization_electra_fast.py @@ -143,7 +143,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 9f34b8c67e42b9..b27b134ecf29dc 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -384,7 +384,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py index 36df84f3055341..a500398d67a992 100644 --- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py @@ -317,7 +317,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if input_shape is None: input_shape = ((1, 1), (1, 1)) @@ -582,7 +582,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -726,7 +725,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -763,7 +762,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py index c6a8fb0f35c5c0..a9441c32290385 100644 --- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py @@ -157,7 +157,6 @@ def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - if pad_token_id is None: raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") pad_token_id = tf.cast(pad_token_id, input_ids.dtype) @@ -348,7 +347,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of 
the library from pretrained model @@ -573,7 +572,6 @@ def call( ) if encoder_outputs is None: - encoder_inputs = { "input_ids": input_ids, "attention_mask": attention_mask, diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index 71dfde96dafbac..91253ab1384bcc 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -131,7 +131,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 631dc9f3fadc2c..25e9e2e251d301 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -497,7 +497,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index 6b961b788829b7..e51c5d01f1558c 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -120,7 +120,7 @@ def __init__( is_folding_model=False, esmfold_config=None, vocab_list=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, mask_token_id=mask_token_id, **kwargs) diff --git a/src/transformers/models/esm/convert_esm.py b/src/transformers/models/esm/convert_esm.py index 996e9eaeed4fe0..6ac74d40e6f957 100644 --- a/src/transformers/models/esm/convert_esm.py +++ b/src/transformers/models/esm/convert_esm.py @@ -20,11 +20,11 @@ from pathlib import Path from tempfile import TemporaryDirectory -import torch - import esm as esm_module +import torch from esm.esmfold.v1.misc import batch_encode_sequences as esmfold_encode_sequences from esm.esmfold.v1.pretrained import esmfold_v1 + from transformers.models.esm.configuration_esm import EsmConfig, EsmFoldConfig from transformers.models.esm.modeling_esm import ( EsmForMaskedLM, diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index dc25903743bb4d..56544f4ca62bc4 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -295,7 +295,6 @@ def forward( past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -597,7 +596,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. 
Setting " diff --git a/src/transformers/models/esm/modeling_tf_esm.py b/src/transformers/models/esm/modeling_tf_esm.py index 27476c61b012cb..2bb25ba94d30c5 100644 --- a/src/transformers/models/esm/modeling_tf_esm.py +++ b/src/transformers/models/esm/modeling_tf_esm.py @@ -329,7 +329,6 @@ def call( output_attentions: Optional[bool] = False, training: bool = False, ) -> Tuple[tf.Tensor]: - mixed_query_layer = self.query(hidden_states) # If this is instantiated as a cross-attention module, the keys @@ -830,7 +829,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/esm/openfold_utils/chunk_utils.py b/src/transformers/models/esm/openfold_utils/chunk_utils.py index 4f68503e99bb68..4b60373438e2d7 100644 --- a/src/transformers/models/esm/openfold_utils/chunk_utils.py +++ b/src/transformers/models/esm/openfold_utils/chunk_utils.py @@ -62,6 +62,7 @@ def _get_minimal_slice_set( end is INCLUSIVE. """ + # start_edges and end_edges both indicate whether, starting from any given # dimension, the start/end index is at the top/bottom edge of the # corresponding tensor, modeled as a tree diff --git a/src/transformers/models/esm/openfold_utils/loss.py b/src/transformers/models/esm/openfold_utils/loss.py index e9523491d519d7..8c442786dc82ba 100644 --- a/src/transformers/models/esm/openfold_utils/loss.py +++ b/src/transformers/models/esm/openfold_utils/loss.py @@ -59,7 +59,7 @@ def compute_predicted_aligned_error( boundaries = torch.linspace(0, max_bin, steps=(no_bins - 1), device=logits.device) aligned_confidence_probs = torch.nn.functional.softmax(logits, dim=-1) - (predicted_aligned_error, max_predicted_aligned_error,) = _calculate_expected_aligned_error( + predicted_aligned_error, max_predicted_aligned_error = _calculate_expected_aligned_error( alignment_confidence_breaks=boundaries, aligned_distance_error_probs=aligned_confidence_probs, ) diff --git a/src/transformers/models/esm/openfold_utils/residue_constants.py b/src/transformers/models/esm/openfold_utils/residue_constants.py index 6cab95652c63b3..8f0ad3b50c6505 100644 --- a/src/transformers/models/esm/openfold_utils/residue_constants.py +++ b/src/transformers/models/esm/openfold_utils/residue_constants.py @@ -399,11 +399,13 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis @functools.lru_cache(maxsize=None) -def load_stereo_chemical_props() -> Tuple[ - Mapping[str, List[Bond]], - Mapping[str, List[Bond]], - Mapping[str, List[BondAngle]], -]: +def load_stereo_chemical_props() -> ( + Tuple[ + Mapping[str, List[Bond]], + Mapping[str, List[Bond]], + Mapping[str, List[BondAngle]], + ] +): """Load stereo_chemical_props.txt into a nice structure. 
Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py index 7d9c60338516e3..ba6d79891fa90d 100644 --- a/src/transformers/models/flaubert/configuration_flaubert.py +++ b/src/transformers/models/flaubert/configuration_flaubert.py @@ -180,7 +180,7 @@ def __init__( lang_id=0, pad_token_id=2, bos_token_id=0, - **kwargs + **kwargs, ): """Constructs FlaubertConfig.""" self.pre_norm = pre_norm diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 8e48a21285eac1..bb7790730115c3 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -98,7 +98,6 @@ def get_masks(slen, lengths, causal, padding_mask=None): # Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention class MultiHeadAttention(nn.Module): - NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index d52f41a57d65c5..919cd6cc1e4227 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -826,7 +826,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFFlaubertWithLMHeadModelOutput]: - transformer_outputs = self.transformer( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py index 5a0ac59c39dfd0..26f68e75d70579 100644 --- a/src/transformers/models/flaubert/tokenization_flaubert.py +++ b/src/transformers/models/flaubert/tokenization_flaubert.py @@ -245,9 +245,8 @@ def __init__( ], lang2id=None, id2lang=None, - **kwargs + **kwargs, ): - do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None) if do_lowercase_and_remove_accent is not None: logger.warning( diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index e74101203c501a..da2c9cc95cfbb3 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -109,7 +109,7 @@ def __init__( qkv_bias: bool = True, mask_token: bool = True, vocab_size: int = 8192, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -131,7 +131,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the image config dict if we are loading from FlavaConfig @@ -237,7 +236,7 @@ def __init__( layer_norm_eps: float = 1e-12, pad_token_id: int = 0, qkv_bias: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -259,7 +258,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from FlavaConfig @@ -343,7 +341,7 @@ def __init__( layer_norm_eps: float = 1e-12, qkv_bias: bool = True, use_cls_token: bool = True, - 
**kwargs + **kwargs, ): super().__init__(**kwargs) @@ -444,7 +442,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the image codebook config dict if we are loading from FlavaConfig @@ -555,7 +552,7 @@ def __init__( global_backprop_contrastive: bool = True, skip_unmasked_multimodal_encoder: bool = True, return_loss: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -621,7 +618,7 @@ def from_configs( text_config: FlavaTextConfig, multimodal_config: FlavaMultimodalConfig, image_codebook_config: FlavaImageCodebookConfig, - **kwargs + **kwargs, ): r""" Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model diff --git a/src/transformers/models/flava/image_processing_flava.py b/src/transformers/models/flava/image_processing_flava.py index c41bb37e72377c..9f54306112fe3a 100644 --- a/src/transformers/models/flava/image_processing_flava.py +++ b/src/transformers/models/flava/image_processing_flava.py @@ -254,7 +254,7 @@ def __init__( codebook_do_normalize: bool = True, codebook_image_mean: Optional[Union[float, Iterable[float]]] = None, codebook_image_std: Optional[Union[float, Iterable[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -338,7 +338,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -365,7 +365,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along @@ -389,7 +389,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -410,7 +410,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
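Two further mechanical rewrites appear a little earlier in this section, in the ESM/openfold hunks: the redundant parentheses around a multi-target assignment are dropped (`(a, b,) = f()` becomes `a, b = f()`), and the long return annotation of `load_stereo_chemical_props` is wrapped in parentheses as a whole instead of being split inside the subscript. A compact sketch of both, with made-up helper names standing in for the real openfold functions:

```python
from typing import List, Mapping, Tuple


def _calculate_expected_error() -> Tuple[float, float]:
    """Toy stand-in for the helper unpacked in the openfold loss hunk."""
    return 0.25, 1.0


# Old style: parenthesized targets with a trailing comma,
#   (predicted_error, max_predicted_error,) = _calculate_expected_error()
# New style: plain multiple assignment, no parentheses.
predicted_error, max_predicted_error = _calculate_expected_error()


# A long return annotation is now wrapped in parentheses as one unit rather
# than breaking inside the subscript; the annotation itself is unchanged.
def load_props() -> (
    Tuple[
        Mapping[str, List[float]],
        Mapping[str, List[float]],
    ]
):
    return {}, {}


if __name__ == "__main__":
    print(predicted_error, max_predicted_error, load_props())
```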
diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 52f34eca18447a..720e0659725d54 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -509,7 +509,6 @@ def __init__(self, config: FlavaPossibleConfigs) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -569,7 +568,6 @@ def __init__(self, config: FlavaPossibleConfigs) -> None: # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 20562c84569ea1..e13716330fccca 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -78,7 +78,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ This method uses [`FlavaImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py index 29dc4c0f9126fe..9efa06487756dd 100644 --- a/src/transformers/models/fnet/configuration_fnet.py +++ b/src/transformers/models/fnet/configuration_fnet.py @@ -103,7 +103,7 @@ def __init__( pad_token_id=3, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py index 27b6563e5dd970..f77a44874ae429 100644 --- a/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/fnet/convert_fnet_original_flax_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from flax.training.checkpoints import restore_checkpoint + from transformers import FNetConfig, FNetForPreTraining from transformers.utils import logging diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 75684025443b04..ebc58167b51725 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -185,7 +185,6 @@ def _init_fourier_transform(self, config): self.fourier_transform = fftn def forward(self, hidden_states): - # NOTE: We do not use torch.vmap as it is not integrated into PyTorch stable versions. # Interested users can modify the code to use vmap from the nightly versions, getting the vmap from here: # https://pytorch.org/docs/master/generated/torch.vmap.html. 
Note that fourier transform methods will need diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py index e7e3adfd793a50..6edcec45b5b03a 100644 --- a/src/transformers/models/fnet/tokenization_fnet.py +++ b/src/transformers/models/fnet/tokenization_fnet.py @@ -113,7 +113,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. @@ -238,7 +238,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py index 7cbd339c8b58b2..e71dbb8977acc2 100644 --- a/src/transformers/models/fnet/tokenization_fnet_fast.py +++ b/src/transformers/models/fnet/tokenization_fnet_fast.py @@ -104,7 +104,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it and # is included in the raw text, there should be a match in a non-normalized sentence. diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py index 85775d07b2e315..decfb1b90f9cba 100644 --- a/src/transformers/models/fsmt/configuration_fsmt.py +++ b/src/transformers/models/fsmt/configuration_fsmt.py @@ -171,7 +171,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **common_kwargs + **common_kwargs, ): self.langs = langs self.src_vocab_size = src_vocab_size diff --git a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py index 85f5290a9ebd21..ef2764f0ed10ba 100755 --- a/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/fsmt/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py @@ -88,7 +88,6 @@ def rewrite_dict_keys(d): def convert_fsmt_checkpoint_to_pytorch(fsmt_checkpoint_path, pytorch_dump_folder_path): - # prep assert os.path.exists(fsmt_checkpoint_path) os.makedirs(pytorch_dump_folder_path, exist_ok=True) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index ac22dea6fcc079..408967bd0590e3 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1273,7 +1273,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): return { "input_ids": None, # encoder_outputs is defined. 
input_ids not needed diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py index 9b9dc0bc956ccd..1c401c1faa9ee3 100644 --- a/src/transformers/models/fsmt/tokenization_fsmt.py +++ b/src/transformers/models/fsmt/tokenization_fsmt.py @@ -195,7 +195,7 @@ def __init__( bos_token="", sep_token="", pad_token="", - **kwargs + **kwargs, ): super().__init__( langs=langs, diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py index ddaa7d9b5ed493..8d87ae23a77045 100644 --- a/src/transformers/models/funnel/configuration_funnel.py +++ b/src/transformers/models/funnel/configuration_funnel.py @@ -124,7 +124,7 @@ def __init__( separate_cls=True, truncate_seq=True, pool_q_only=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.block_sizes = block_sizes diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py index e2def568d1cdba..2b109cdbab8a2f 100644 --- a/src/transformers/models/funnel/modeling_tf_funnel.py +++ b/src/transformers/models/funnel/modeling_tf_funnel.py @@ -771,7 +771,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -1177,7 +1176,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[Tuple[tf.Tensor], TFBaseModelOutput]: - return self.funnel( input_ids=input_ids, attention_mask=attention_mask, @@ -1225,7 +1223,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[Tuple[tf.Tensor], TFFunnelForPreTrainingOutput]: r""" Returns: diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 476fec51a8a45b..245694bfac52ef 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -155,7 +155,7 @@ def __init__( eos_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -209,7 +209,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py index 60be9fbcd76956..864303eb210153 100644 --- a/src/transformers/models/funnel/tokenization_funnel_fast.py +++ b/src/transformers/models/funnel/tokenization_funnel_fast.py @@ -158,7 +158,7 @@ def __init__( tokenize_chinese_chars=True, strip_accents=None, wordpieces_prefix="##", - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index fc933f32591c52..7840ce63de65b3 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -91,7 +91,7 @@ def __init__( layer_norm_eps=1e-5, attention_dropout=0.0, initializer_range=0.02, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -109,7 +109,6 @@ def __init__( 
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # get the vision config dict if we are loading from GITConfig
@@ -211,7 +210,7 @@ def __init__(
        bos_token_id=101,
        eos_token_id=102,
        num_image_with_embedding=None,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
diff --git a/src/transformers/models/git/convert_git_to_pytorch.py b/src/transformers/models/git/convert_git_to_pytorch.py
index e4ebf645ae7afb..e089ec89854f96 100644
--- a/src/transformers/models/git/convert_git_to_pytorch.py
+++ b/src/transformers/models/git/convert_git_to_pytorch.py
@@ -21,12 +21,12 @@
from pathlib import Path
import numpy as np
+import requests
import torch
+from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
-import requests
-from huggingface_hub import hf_hub_download
from transformers import (
    AutoTokenizer,
    CLIPImageProcessor,
diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py
index b8c95934f8e7e8..96fa94d9a7df9e 100644
--- a/src/transformers/models/git/modeling_git.py
+++ b/src/transformers/models/git/modeling_git.py
@@ -443,7 +443,6 @@ def forward(
            past_key_value = past_key_values[i] if past_key_values is not None else None
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/glpn/configuration_glpn.py b/src/transformers/models/glpn/configuration_glpn.py
index aec3d1c4679527..9951d1615cc622 100644
--- a/src/transformers/models/glpn/configuration_glpn.py
+++ b/src/transformers/models/glpn/configuration_glpn.py
@@ -112,7 +112,7 @@ def __init__(
        decoder_hidden_size=64,
        max_depth=10,
        head_in_index=-1,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(**kwargs)
diff --git a/src/transformers/models/glpn/convert_glpn_to_pytorch.py b/src/transformers/models/glpn/convert_glpn_to_pytorch.py
index d083ff8271d802..fa0af4691a1439 100644
--- a/src/transformers/models/glpn/convert_glpn_to_pytorch.py
+++ b/src/transformers/models/glpn/convert_glpn_to_pytorch.py
@@ -19,10 +19,10 @@
from collections import OrderedDict
from pathlib import Path
+import requests
import torch
from PIL import Image
-import requests
from transformers import GLPNConfig, GLPNFeatureExtractor, GLPNForDepthEstimation
from transformers.utils import logging
diff --git a/src/transformers/models/glpn/image_processing_glpn.py b/src/transformers/models/glpn/image_processing_glpn.py
index 0533d4c24271d0..ba279c3c4ff48c 100644
--- a/src/transformers/models/glpn/image_processing_glpn.py
+++ b/src/transformers/models/glpn/image_processing_glpn.py
@@ -57,7 +57,7 @@ def __init__(
        size_divisor: int = 32,
        resample=PILImageResampling.BILINEAR,
        do_rescale: bool = True,
-        **kwargs
+        **kwargs,
    ) -> None:
        self.do_resize = do_resize
        self.do_rescale = do_rescale
diff --git a/src/transformers/models/gpt2/modeling_flax_gpt2.py b/src/transformers/models/gpt2/modeling_flax_gpt2.py
index 8e375b6dbe6035..2a449360b8893d 100644
--- a/src/transformers/models/gpt2/modeling_flax_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -199,7 +199,6 @@ def __call__(
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
-
        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 1a7ba62c4146a2..9197a1f56dd53b 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -844,7 +844,6 @@ def forward(
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
@@ -860,7 +859,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index c08e864b8d37f9..62df70cce4b93f 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -163,7 +163,6 @@ def call(
        output_attentions,
        training=False,
    ):
-
        if encoder_hidden_states is not None:
            if not hasattr(self, "q_attn"):
                raise ValueError(
@@ -230,7 +229,6 @@ def __init__(self, config, scale=False, **kwargs):
        self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")
        if config.add_cross_attention:
-
            self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
            self.ln_cross_attn = tf.keras.layers.LayerNormalization(
                epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
@@ -365,7 +363,6 @@ def call(
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
-
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index 1be35fbfdf4919..c462e45d01e3df 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -165,7 +165,7 @@ def __init__(
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
-        **kwargs
+        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index eefd35aa94a7a9..7d7500ee9cca3b 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -133,7 +133,7 @@ def __init__(
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            vocab_file,
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_tf.py b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
index ba6f754373c5c7..4ab4af5b9d66f3 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_tf.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -2,7 +2,6 @@
from typing import Dict, List, Union
import tensorflow as tf
-
from keras_nlp.tokenizers import BytePairTokenizer
from tensorflow_text import pad_model_inputs
diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index be48e95d2ccd26..f7d59cfca3c1c4 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -116,7 +116,7 @@ def __init__(
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
-        **kwargs
+        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
@@ -221,7 +221,6 @@ def generate_dummy_inputs(
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
-
        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 7a5a913292e32b..68084f3b7cbbe9 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -219,7 +219,6 @@ def forward(
        use_cache=False,
        output_attentions=False,
    ):
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
@@ -596,7 +595,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
index a5ba1fddd93a21..7d2d87e65f710a 100644
--- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py
@@ -103,7 +103,7 @@ def __init__(
        eos_token_id=2,
        tie_word_embeddings=False,
        use_parallel_residual=True,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index 589eaae7804918..7fed1ad556249f 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -315,7 +315,6 @@ def forward(
        layer_past=None,
        output_attentions=False,
    ):
-
        attention_layer_outputs = self.attention(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
@@ -522,7 +521,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -577,7 +575,6 @@ def custom_forward(*inputs):
    """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING
)
class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel):
-
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
    def __init__(self, config):
diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
index c08d533835d708..1d4c1cec3a754f 100644
--- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
+++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
@@ -106,7 +106,7 @@ def __init__(
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            vocab_file,
diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
index d0df7ef44d245e..8d8519b9eae8bf 100644
--- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
@@ -101,7 +101,7 @@ def __init__(
        eos_token_id=31999,
        attention_dropout=0.1,
        hidden_dropout=0.0,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
index 76df8beee9cc71..8badb60f97b550 100755
--- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -590,7 +590,6 @@ def forward(
    GPT_NEOX_JAPANESE_START_DOCSTRING,
)
class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel):
-
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"]
    def __init__(self, config):
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index a132d999a31370..c9f4f677cb483e 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -129,7 +129,7 @@ def __init__(
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        do_clean_text=False,
-        **kwargs
+        **kwargs,
    ):
        super().__init__(
            unk_token=unk_token,
diff --git a/src/transformers/models/gpt_sw3/__init__.py b/src/transformers/models/gpt_sw3/__init__.py
index c9e6dca3ef6efa..3d5245364a3aed 100644
--- a/src/transformers/models/gpt_sw3/__init__.py
+++ b/src/transformers/models/gpt_sw3/__init__.py
@@ -33,7 +33,6 @@
if TYPE_CHECKING:
-
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
index bb2025381c72f2..f982c5b6b17164 100644
--- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
+++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -115,9 +115,8 @@ def __init__(
        eos_token=None,
        bos_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
-        **kwargs
+        **kwargs,
    ) -> None:
-
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        name_or_path = kwargs.get("name_or_path")
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index dd0976cb293183..b40861c354be76 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -112,7 +112,7 @@ def __init__(
        bos_token_id=50256,
        eos_token_id=50256,
        tie_word_embeddings=False,
-        **kwargs
+        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_positions = n_positions
diff --git a/src/transformers/models/gptj/modeling_flax_gptj.py b/src/transformers/models/gptj/modeling_flax_gptj.py
index 1f00893b5c873b..6270355129ff27 100644
--- a/src/transformers/models/gptj/modeling_flax_gptj.py
+++ b/src/transformers/models/gptj/modeling_flax_gptj.py
@@ -16,11 +16,10 @@
from functools import partial
from typing import Optional, Tuple
-import numpy as np
-
import flax.linen as nn
import jax
import jax.numpy as jnp
+import numpy as np
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
@@ -212,7 +211,6 @@ def __call__(
        init_cache: bool = False,
        output_attentions: bool = False,
    ):
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 84282fb0736781..3459a93b5dd43a 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -152,7 +152,6 @@ def _attn(
        attention_mask=None,
        head_mask=None,
    ):
-
        # compute causal mask from causal mask buffer
        query_length, key_length = query.size(-2), key.size(-2)
        causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool)
@@ -199,7 +198,6 @@ def forward(
        Tuple[torch.Tensor, Tuple[torch.Tensor]],
        Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
    ]:
-
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)
@@ -627,7 +625,6 @@ def forward(
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
@@ -643,7 +640,6 @@ def forward(
                all_hidden_states = all_hidden_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
-
                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/gptj/modeling_tf_gptj.py b/src/transformers/models/gptj/modeling_tf_gptj.py index 244d4d3ba951b8..f077a52a03ae02 100644 --- a/src/transformers/models/gptj/modeling_tf_gptj.py +++ b/src/transformers/models/gptj/modeling_tf_gptj.py @@ -386,7 +386,6 @@ def call( return_dict=None, training=False, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/graphormer/algos_graphormer.pyx b/src/transformers/models/graphormer/algos_graphormer.pyx index 981a2b6299ea85..a0fafbdee53b55 100644 --- a/src/transformers/models/graphormer/algos_graphormer.pyx +++ b/src/transformers/models/graphormer/algos_graphormer.pyx @@ -4,7 +4,6 @@ import cython cimport numpy - from cython.parallel cimport parallel, prange import numpy as np diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/graphormer/modeling_graphormer.py index 82b8b9f87620db..c5d293a6135077 100755 --- a/src/transformers/models/graphormer/modeling_graphormer.py +++ b/src/transformers/models/graphormer/modeling_graphormer.py @@ -799,7 +799,7 @@ def forward( perturb=None, masked_tokens=None, return_dict: Optional[bool] = None, - **unused + **unused, ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 98526caa635fe9..ded75904c55532 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -108,7 +108,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from GroupViTConfig @@ -221,7 +220,7 @@ def __init__( initializer_factor=1.0, assign_eps=1.0, assign_mlp_ratio=[0.5, 4], - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -251,7 +250,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from GroupViTConfig @@ -303,7 +301,7 @@ def __init__( projection_dim=256, projection_intermediate_dim=4096, logit_scale_init_value=2.6592, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -392,7 +390,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py index e83bdd35cb37cb..059f10f6129bee 100644 --- a/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py +++ 
b/src/transformers/models/groupvit/convert_groupvit_nvlab_to_hf.py @@ -21,10 +21,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import CLIPProcessor, GroupViTConfig, GroupViTModel diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 8a5f247e6537fa..023f9370aa2fda 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -195,7 +195,6 @@ def __init__(self, config: GroupViTVisionConfig): self.assign_eps = config.assign_eps def get_attn(self, attn, gumbel=True, hard=True): - if gumbel and self.training: attn = gumbel_softmax(attn, dim=-2, hard=hard) else: @@ -931,7 +930,6 @@ def forward( output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index 006bd868c9b7f1..3826b83e7a4637 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -291,7 +291,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.assign_eps = config.assign_eps def get_attn(self, attn: tf.Tensor, gumbel: bool = True, hard: bool = True, training: bool = False) -> tf.Tensor: - if gumbel and training: attn = gumbel_softmax(attn, dim=-2, hard=hard) else: @@ -474,7 +473,6 @@ def __init__(self, config: GroupViTVisionConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - num_patches = self.patch_embeddings.num_patches self.position_embeddings = self.add_weight( shape=(1, num_patches, self.config.hidden_size), @@ -540,7 +538,6 @@ def __init__(self, config: GroupViTTextConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - with tf.name_scope("token_embedding"): self.weight = self.add_weight( shape=(self.config.vocab_size, self.embed_dim), @@ -1104,7 +1101,6 @@ def call( return_dict: bool, training: bool = False, ) -> Union[Tuple, TFBaseModelOutputWithPooling]: - embedding_output = self.embeddings(pixel_values) encoder_outputs = self.encoder( @@ -1202,7 +1198,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1264,7 +1259,6 @@ def __init__(self, config: GroupViTConfig, **kwargs): ] def build(self, input_shape: tf.TensorShape): - self.logit_scale = self.add_weight( shape=(1,), initializer=tf.keras.initializers.Constant(self.config.logit_scale_init_value), @@ -1285,7 +1279,6 @@ def get_text_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -1320,7 +1313,6 @@ def get_image_features( return_dict: Optional[bool] = None, training: bool = False, ) -> tf.Tensor: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1353,7 +1345,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFGroupViTModelOutput, Tuple[tf.Tensor]]: - if 
input_ids is None: raise ValueError("You have to specify either input_ids") if pixel_values is None: diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 479446787a3fe2..80c6cb6d63e133 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -320,9 +320,8 @@ def __init__( ], lang2id=None, id2lang=None, - **kwargs + **kwargs, ): - super().__init__( unk_token=unk_token, bos_token=bos_token, @@ -486,7 +485,6 @@ def bpe(self, token): return word def _tokenize(self, text): - pre_tokens = self.bert_pre_tokenizer.tokenize(text) split_tokens = [] diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py index 234ad4a5679190..67e38c1c5ee7bd 100644 --- a/src/transformers/models/herbert/tokenization_herbert_fast.py +++ b/src/transformers/models/herbert/tokenization_herbert_fast.py @@ -72,9 +72,8 @@ def __init__( pad_token="", mask_token="", sep_token="", - **kwargs + **kwargs, ): - super().__init__( vocab_file, merges_file, diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index be2e6bbf4c71ab..139df45bbb791d 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -196,7 +196,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py index d7ba74fedae7b2..571761e022846f 100644 --- a/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_distilhubert_original_s3prl_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from s3prl.hub import distilhubert + from transformers import HubertConfig, HubertModel, Wav2Vec2FeatureExtractor, logging diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index a96ef5cf5db44a..b1490557820541 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1180,7 +1180,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index df34adc66dbec0..f4722532e8a3e3 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -318,7 +318,6 @@ def __init__( self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -330,7 +329,6 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -367,7 +365,6 @@ def compute_output_shape(self, input_shape): 
return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): - group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -380,7 +377,6 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -432,7 +428,6 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): - dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -453,19 +448,16 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): - if self.axis == 0: raise ValueError( "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead" ) def _create_input_spec(self, input_shape): - dim = input_shape[self.axis] self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -481,7 +473,6 @@ def _add_gamma_weight(self, input_shape): self.gamma = None def _add_beta_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -1647,7 +1638,6 @@ def call( logits = self.lm_head(hidden_states) if labels is not None: - if tf.reduce_max(labels) >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index 32d4d2e56a809e..fe46e3fca61539 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -111,7 +111,7 @@ def __init__( position_embedding_type="absolute", quant_mode=False, force_dequant="none", - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py index fa657924645e93..8e2f123c578c0b 100644 --- a/src/transformers/models/ibert/quant_modules.py +++ b/src/transformers/models/ibert/quant_modules.py @@ -163,7 +163,6 @@ def forward( specified_min=None, specified_max=None, ): - x_act = x if identity is None else identity + x # collect running stats if training if self.training: @@ -663,7 +662,6 @@ def forward(ctx, x, k, percentile_mode, scale): @staticmethod def backward(ctx, grad_output): - scale = ctx.scale if len(grad_output.shape) == 4: scale = scale.view(-1, 1, 1, 1) @@ -771,7 +769,6 @@ def forward( identity=None, identity_scaling_factor=None, ): - if len(pre_act_scaling_factor.shape) == 3: reshape = lambda x: x # noqa: E731 else: diff --git a/src/transformers/models/imagegpt/image_processing_imagegpt.py b/src/transformers/models/imagegpt/image_processing_imagegpt.py index af31fdf1918f33..451b4a265feb0a 100644 --- a/src/transformers/models/imagegpt/image_processing_imagegpt.py +++ b/src/transformers/models/imagegpt/image_processing_imagegpt.py @@ -91,7 +91,7 @@ def __init__( resample: PILImageResampling = PILImageResampling.BILINEAR, do_normalize: bool = True, do_color_quantize: bool = True, - **kwargs + **kwargs, ) 
-> None: super().__init__(**kwargs) size = size if size is not None else {"height": 256, "width": 256} @@ -109,7 +109,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]). diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 4e52ef5c071d66..d7537c9dc35534 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -796,7 +796,6 @@ def forward( all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): - # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) @@ -812,7 +811,6 @@ def forward( all_hidden_states = all_hidden_states + (hidden_states,) if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/jukebox/configuration_jukebox.py b/src/transformers/models/jukebox/configuration_jukebox.py index 6ce345a8578e28..e705af931ec301 100644 --- a/src/transformers/models/jukebox/configuration_jukebox.py +++ b/src/transformers/models/jukebox/configuration_jukebox.py @@ -301,7 +301,7 @@ def __init__( spread=None, timing_dims=64, zero_out=False, - **kwargs + **kwargs, ): self.act_fn = act_fn self.alignment_head = alignment_head @@ -459,7 +459,7 @@ def __init__( sample_length=1058304, init_scale=0.2, zero_out=False, - **kwargs + **kwargs, ): self.hop_fraction = hop_fraction self.conv_input_shape = conv_input_shape @@ -486,7 +486,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from CLIPConfig @@ -576,7 +575,6 @@ def __init__( init_std=0.2, **kwargs, ): - if vqvae_config is None: vqvae_config = {} logger.info("vqvae_config is None. 
initializing the JukeboxVQVAE with default values.") diff --git a/src/transformers/models/jukebox/convert_jukebox.py b/src/transformers/models/jukebox/convert_jukebox.py index c8d0831e53f3de..8625cbe868883b 100644 --- a/src/transformers/models/jukebox/convert_jukebox.py +++ b/src/transformers/models/jukebox/convert_jukebox.py @@ -19,9 +19,9 @@ import os from pathlib import Path +import requests import torch -import requests from transformers import JukeboxConfig, JukeboxModel from transformers.utils import logging @@ -116,7 +116,6 @@ def fix_jukebox_keys(state_dict, model_state_dict, key_prefix, mapping): re_prior_cond_proj_in = re.compile("conditioner_blocks.(\d*).cond.model.(\d*).(bias|weight)") for original_key, value in state_dict.items(): - # rename vqvae.encoder keys if re_encoder_block_conv_in.fullmatch(original_key): regex_match = re_encoder_block_conv_in.match(original_key) diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py index 38fed91e1b01bb..2528f1aa2272bf 100755 --- a/src/transformers/models/jukebox/modeling_jukebox.py +++ b/src/transformers/models/jukebox/modeling_jukebox.py @@ -1625,7 +1625,6 @@ class JukeboxMusicTokenConditioner(nn.Module): """ def __init__(self, config, level): - super().__init__() self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size) config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder` diff --git a/src/transformers/models/jukebox/tokenization_jukebox.py b/src/transformers/models/jukebox/tokenization_jukebox.py index 01bada0e0806b4..85835c6cdf4201 100644 --- a/src/transformers/models/jukebox/tokenization_jukebox.py +++ b/src/transformers/models/jukebox/tokenization_jukebox.py @@ -23,8 +23,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - import regex + from transformers.utils.generic import _is_jax, _is_numpy from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -126,7 +126,7 @@ def __init__( max_n_lyric_tokens=512, n_genres=5, unk_token="<|endoftext|>", - **kwargs + **kwargs, ): unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token super().__init__( diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py index c1e24acffa4562..f297adea1420c1 100644 --- a/src/transformers/models/layoutlm/configuration_layoutlm.py +++ b/src/transformers/models/layoutlm/configuration_layoutlm.py @@ -112,7 +112,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, max_2d_position_embeddings=1024, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 3b696d778adac3..d23c68840064d3 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -478,7 +478,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py index 41267ab1f52939..2097ae58b8bf35 100644 --- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py @@ -717,7 +717,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index bc7fbc9bb95baa..99f517c6a2a267 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -132,7 +132,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -180,7 +180,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py index 42d664c061c790..7ba06d7fa1107e 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py @@ -117,7 +117,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py index db14690387aa90..3cc8027c1dd5c8 100644 --- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py @@ -145,7 +145,7 @@ def __init__( has_spatial_attention_bias=True, has_visual_segment_embedding=False, detectron2_config_args=None, - **kwargs + **kwargs, ): super().__init__( vocab_size=vocab_size, diff --git a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py index 04547eebd8ed95..ca01b3670d09c2 100644 --- a/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/image_processing_layoutlmv2.py @@ -136,7 +136,7 @@ def __init__( apply_ocr: bool = True, ocr_lang: Optional[str] = None, tesseract_config: Optional[str] = "", - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -155,7 +155,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. 
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py index 4ddd95bfbe5614..280b93043afeb1 100644 --- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py @@ -85,7 +85,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv2ImageProcessor.__call__`]. In case diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index 306c4f34f1ff74..c6a8857325f4ee 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -242,7 +242,7 @@ def __init__( strip_accents=None, model_max_length: int = 512, additional_special_tokens: Optional[List[str]] = None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -304,7 +304,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -444,7 +443,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -463,6 +462,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -596,9 +596,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -655,9 +654,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -779,7 +777,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -826,7 +824,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. 
warning:: This method is deprecated, @@ -892,7 +890,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -946,7 +944,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1215,8 +1213,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py index a2059dbf743a34..bed4e133aa3c5c 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py @@ -136,7 +136,7 @@ def __init__( only_label_first_subword=True, tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -197,7 +197,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -216,6 +216,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -349,9 +350,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -413,7 +413,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. 
warning:: This method is deprecated, @@ -484,7 +484,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -636,9 +635,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py index 96876ced612e7f..e39e5b61e299bb 100644 --- a/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/configuration_layoutlmv3.py @@ -153,7 +153,7 @@ def __init__( num_channels=3, patch_size=16, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( vocab_size=vocab_size, @@ -191,7 +191,6 @@ def __init__( class LayoutLMv3OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.12") @property diff --git a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py index c2cd270846c902..aec81de1a99ec2 100644 --- a/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/image_processing_layoutlmv3.py @@ -157,7 +157,7 @@ def __init__( apply_ocr: bool = True, ocr_lang: Optional[str] = None, tesseract_config: Optional[str] = "", - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -181,7 +181,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to (size["height"], size["width"]) dimensions. @@ -207,7 +207,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a scale factor. image = image * scale. @@ -228,7 +228,7 @@ def normalize( mean: Union[float, Iterable[float]], std: Union[float, Iterable[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. diff --git a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py index 9c6a8416d51f1f..4d406962378005 100644 --- a/src/transformers/models/layoutlmv3/processing_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/processing_layoutlmv3.py @@ -85,7 +85,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. 
In case diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py index 13a797be523ce4..b9c0ab127d42ca 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py @@ -291,7 +291,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -566,7 +566,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -585,6 +585,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -719,9 +720,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -779,9 +779,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -905,7 +904,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -953,7 +952,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -1020,7 +1019,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1074,7 +1073,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1346,8 +1345,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." 
) logger.error(error_msg) diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index 98e2ee3e320f27..4bd3e91480c02b 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -156,7 +156,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, @@ -243,7 +243,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -262,6 +262,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -396,9 +397,8 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( padding=padding, @@ -462,7 +462,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -534,7 +534,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -687,9 +686,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py index 49fbb1ac3ddc77..250c8c7fc6140c 100644 --- a/src/transformers/models/layoutxlm/processing_layoutxlm.py +++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py @@ -65,7 +65,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py index d825bd8f36248f..47c5315457b4fa 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py @@ -247,7 +247,7 @@ def __init__( pad_token_label=-100, only_label_first_subword=True, sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -462,7 +462,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -481,6 +481,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -613,9 +614,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -736,7 +736,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -790,7 +790,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py index 439d3994781c04..322239192740d0 100644 --- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py +++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py @@ -233,7 +233,7 @@ def __init__( pad_token_box=[0, 0, 0, 0], pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -287,7 +287,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -306,6 +306,7 @@ def __call__( word_labels (`List[int]`, `List[List[int]]`, *optional*): Word-level integer labels (for token classification tasks such as FUNSD, CORD). 
""" + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -448,7 +449,6 @@ def _batch_encode_plus( verbose: bool = True, **kwargs, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") @@ -600,9 +600,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - # make it a batched input # 2 options: # 1) only text, in case text must be a list of str diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py index 98d2e32f62e4e8..34c286ce18910f 100644 --- a/src/transformers/models/led/configuration_led.py +++ b/src/transformers/models/led/configuration_led.py @@ -132,7 +132,7 @@ def __init__( bos_token_id=0, eos_token_id=2, attention_window: Union[List[int], int] = 512, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_encoder_position_embeddings = max_encoder_position_embeddings diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 7d018a52d9e073..b60f088de808c1 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2135,7 +2135,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py index e45d94c742f8e9..75d4a15f194d11 100644 --- a/src/transformers/models/led/modeling_tf_led.py +++ b/src/transformers/models/led/modeling_tf_led.py @@ -1769,7 +1769,7 @@ def call( if global_attention_mask is not None: attention_mask = attention_mask * tf.cast((global_attention_mask + 1), dtype=attention_mask.dtype) - (padding_len, input_ids, attention_mask, inputs_embeds,) = self._pad_to_window_size( + padding_len, input_ids, attention_mask, inputs_embeds = self._pad_to_window_size( input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, @@ -1809,7 +1809,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: hidden_states_to_add = self.compute_hidden_states(hidden_states, padding_len) encoder_states = encoder_states + (hidden_states_to_add,) @@ -2188,9 +2187,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -2290,9 +2288,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - outputs = self.led( input_ids=input_ids, attention_mask=attention_mask, @@ -2516,7 +2513,7 @@ def prepare_inputs_for_generation( decoder_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py index 812e374c7a485d..5b22701a226224 100644 --- a/src/transformers/models/led/tokenization_led.py +++ b/src/transformers/models/led/tokenization_led.py @@ -185,7 +185,7 @@ def __init__( pad_token="", mask_token="", 
add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py index ee9118f11d680f..153b32c2967bf4 100644 --- a/src/transformers/models/led/tokenization_led_fast.py +++ b/src/transformers/models/led/tokenization_led_fast.py @@ -148,7 +148,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/levit/configuration_levit.py b/src/transformers/models/levit/configuration_levit.py index 525221217ad450..06c7925a8f3797 100644 --- a/src/transformers/models/levit/configuration_levit.py +++ b/src/transformers/models/levit/configuration_levit.py @@ -105,7 +105,7 @@ def __init__( mlp_ratio=[2, 2, 2], attention_ratio=[2, 2, 2], initializer_range=0.02, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.image_size = image_size @@ -130,7 +130,6 @@ def __init__( # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig class LevitOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py index de8826ce61d3e0..a6ae6603e5de9f 100644 --- a/src/transformers/models/levit/convert_levit_timm_to_pytorch.py +++ b/src/transformers/models/levit/convert_levit_timm_to_pytorch.py @@ -21,10 +21,10 @@ from functools import partial from pathlib import Path -import torch - import timm +import torch from huggingface_hub import hf_hub_download + from transformers import LevitConfig, LevitFeatureExtractor, LevitForImageClassificationWithTeacher from transformers.utils import logging diff --git a/src/transformers/models/levit/image_processing_levit.py b/src/transformers/models/levit/image_processing_levit.py index b369bfd33ad9f0..6aef221a16a8cd 100644 --- a/src/transformers/models/levit/image_processing_levit.py +++ b/src/transformers/models/levit/image_processing_levit.py @@ -99,7 +99,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_MEAN, image_std: Optional[Union[float, Iterable[float]]] = IMAGENET_DEFAULT_STD, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -124,7 +124,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -168,7 +168,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. @@ -191,7 +191,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a scale factor. image = image * scale. @@ -212,7 +212,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. 
image = (image - image_mean) / image_std. diff --git a/src/transformers/models/lilt/configuration_lilt.py b/src/transformers/models/lilt/configuration_lilt.py index 16ec1d658f0ed0..d11899c94312ad 100644 --- a/src/transformers/models/lilt/configuration_lilt.py +++ b/src/transformers/models/lilt/configuration_lilt.py @@ -111,7 +111,7 @@ def __init__( classifier_dropout=None, channel_shrink_ratio=4, max_2d_position_embeddings=1024, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index b372ddd9fca545..6a2b820b4ff8b7 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -239,7 +239,6 @@ def forward( head_mask=None, output_attentions=False, ): - layout_value_layer = self.transpose_for_scores(self.layout_value(layout_inputs), r=self.channel_shrink_ratio) layout_key_layer = self.transpose_for_scores(self.layout_key(layout_inputs), r=self.channel_shrink_ratio) layout_query_layer = self.transpose_for_scores(self.layout_query(layout_inputs), r=self.channel_shrink_ratio) diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py index 1e8eea812db9cb..3f3e2da7e830e8 100644 --- a/src/transformers/models/longformer/configuration_longformer.py +++ b/src/transformers/models/longformer/configuration_longformer.py @@ -133,7 +133,7 @@ def __init__( layer_norm_eps: float = 1e-12, position_embedding_type: str = "absolute", onnx_export: bool = False, - **kwargs + **kwargs, ): """Constructs LongformerConfig.""" super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py index 4d9ebe017a1d86..ed7d32ab3edbef 100644 --- a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py +++ b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py @@ -39,7 +39,6 @@ def forward(self): def convert_longformer_qa_checkpoint_to_pytorch( longformer_model: str, longformer_question_answering_ckpt_path: str, pytorch_dump_folder_path: str ): - # load longformer model from model identifier longformer = LongformerModel.from_pretrained(longformer_model) lightning_model = LightningModel(longformer) diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 44c8a5e31c7779..6ad6bdad6711a6 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1886,7 +1886,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class LongformerForSequenceClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -2014,7 +2013,6 @@ def forward(self, hidden_states, **kwargs): LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -2154,7 +2152,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git 
a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index 10758ec7005e0c..e5e22a21276faf 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -1692,7 +1692,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and not isinstance(input_ids, tf.Tensor): input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int64) elif input_ids is not None: @@ -2060,7 +2059,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[TFLongformerBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - outputs = self.longformer( input_ids=input_ids, attention_mask=attention_mask, @@ -2393,7 +2391,6 @@ def call( labels: Optional[Union[np.ndarray, tf.Tensor]] = None, training: Optional[bool] = False, ) -> Union[TFLongformerSequenceClassifierOutput, Tuple[tf.Tensor]]: - if input_ids is not None and not isinstance(input_ids, tf.Tensor): input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int64) elif input_ids is not None: diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py index 64bbeeb8ce51ae..5ff6f70afdeafb 100644 --- a/src/transformers/models/longformer/tokenization_longformer.py +++ b/src/transformers/models/longformer/tokenization_longformer.py @@ -207,7 +207,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py index 089ee69d668e9c..5d20caf8c2dfa0 100644 --- a/src/transformers/models/longformer/tokenization_longformer_fast.py +++ b/src/transformers/models/longformer/tokenization_longformer_fast.py @@ -188,7 +188,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/longt5/configuration_longt5.py b/src/transformers/models/longt5/configuration_longt5.py index 705fdc4939584b..0927d13034675b 100644 --- a/src/transformers/models/longt5/configuration_longt5.py +++ b/src/transformers/models/longt5/configuration_longt5.py @@ -108,9 +108,8 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): - self.vocab_size = vocab_size self.d_model = d_model self.d_kv = d_kv diff --git a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py index 41cc3a2005dd94..5a1394c719d2d8 100644 --- a/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py +++ b/src/transformers/models/longt5/convert_longt5x_checkpoint_to_flax.py @@ -20,6 +20,7 @@ import argparse from t5x import checkpoints + from transformers import AutoConfig, FlaxAutoModelForSeq2SeqLM diff --git a/src/transformers/models/longt5/modeling_flax_longt5.py b/src/transformers/models/longt5/modeling_flax_longt5.py index de3e43f02cc5fd..458f6d597f30c8 100644 --- a/src/transformers/models/longt5/modeling_flax_longt5.py +++ b/src/transformers/models/longt5/modeling_flax_longt5.py @@ -18,11 +18,10 @@ import copy from typing import Any, 
Callable, List, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -1681,7 +1680,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -2392,7 +2391,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 486eda387686af..1039b1cc5b0374 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -231,7 +231,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -1181,7 +1180,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") @@ -1348,8 +1346,8 @@ def _shift_right(self, input_ids): pad_token_id = self.config.pad_token_id assert decoder_start_token_id is not None, ( - "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the" - " pad_token_id. See LongT5 docs for more information" + "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id." 
+ " See LongT5 docs for more information" ) # shift inputs to the right @@ -2096,9 +2094,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py index 8f7438cc3c6a91..6e5c99900bbdf5 100644 --- a/src/transformers/models/luke/configuration_luke.py +++ b/src/transformers/models/luke/configuration_luke.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): """Constructs LukeConfig.""" super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 25427f8a63a3cd..8df68e9dd1dceb 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -1022,7 +1022,6 @@ def _set_gradient_checkpointing(self, module, value=False): LUKE_START_DOCSTRING, ) class LukeModel(LukePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config: LukeConfig, add_pooling_layer: bool = True): diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index 9a5f6d42a665d2..ff177a44442c71 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -22,7 +22,6 @@ from typing import Dict, List, Optional, Tuple, Union import numpy as np - import regex as re from ...tokenization_utils import PreTrainedTokenizer @@ -313,9 +312,8 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token @@ -597,7 +595,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -742,9 +740,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" @@ -824,7 +821,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -916,7 +913,6 @@ def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spa ) if entities is not None: - if not isinstance(entities, list): raise ValueError("If you specify entities, they should be given as a list") @@ -934,7 +930,7 @@ def _create_input_sequence( entities_pair: Optional[EntityInput] = None, entity_spans: Optional[EntitySpanInput] = None, entity_spans_pair: Optional[EntitySpanInput] = None, - **kwargs + **kwargs, ) -> Tuple[list, list, list, list, list, list]: def get_input_ids(text): tokens = self.tokenize(text, **kwargs) @@ -975,7 +971,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_entity_token_spans, second_entity_token_spans = None, None if self.task is None: - if entity_spans is None: first_ids = get_input_ids(text) else: @@ -1058,7 +1053,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_ids = first_ids[:entity_token_start] + [special_token_id] + first_ids[entity_token_start:] elif self.task == "entity_span_classification": - if not (isinstance(entity_spans, list) and len(entity_spans) > 0 and isinstance(entity_spans[0], tuple)): raise ValueError( "Entity spans should be provided as a list of tuples, " @@ -1187,7 +1181,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids, diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 572fa30d8dd172..9fe7ecb730fdac 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -526,7 +526,6 @@ def forward( visual_attention_mask, output_attentions=False, ): - lang_att_output, visual_att_output = self.cross_att( lang_input=lang_feats, lang_attention_mask=lang_attention_mask, @@ -609,7 +608,6 @@ def forward( visual_attention_mask=None, output_attentions=None, ): - vision_hidden_states = () language_hidden_states = () vision_attentions = () if output_attentions or self.config.output_attentions else None @@ -916,7 +914,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[LxmertModelOutput, Tuple[torch.FloatTensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1123,7 +1120,6 @@ def _set_qa_logit_layer(self, qa_logit_layer): self.answer_head.logit_fc[-1] = qa_logit_layer def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels): - if num_labels is None: return cur_qa_logit_layer @@ -1355,7 +1351,6 @@ def _set_qa_logit_layer(self, qa_logit_layer): self.answer_head.logit_fc[-1] = qa_logit_layer def _get_resized_qa_labels(self, cur_qa_logit_layer, num_labels): - if num_labels is None: return cur_qa_logit_layer diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py index a657ddb94b5c23..84d05bebce2e85 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert.py +++ 
b/src/transformers/models/lxmert/tokenization_lxmert.py @@ -124,7 +124,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -172,7 +172,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py index 1b804f5239b202..8e58a3aafac5c1 100644 --- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py +++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py @@ -103,7 +103,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py index 0ab2365accd34a..453f8d45f3dca5 100644 --- a/src/transformers/models/m2m_100/configuration_m2m_100.py +++ b/src/transformers/models/m2m_100/configuration_m2m_100.py @@ -128,7 +128,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index c354b9503d418e..58f0ca289c467f 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -1054,7 +1054,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. 
Setting" @@ -1080,7 +1079,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=combined_attention_mask, diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py index 0a6ad7ec3a15f4..a2fdd41d7442e0 100644 --- a/src/transformers/models/marian/configuration_marian.py +++ b/src/transformers/models/marian/configuration_marian.py @@ -133,7 +133,7 @@ def __init__( eos_token_id=0, forced_eos_token_id=0, share_encoder_decoder_embeddings=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.decoder_vocab_size = decoder_vocab_size or vocab_size diff --git a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py index c175144623985f..f6b548c2b07f46 100644 --- a/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py @@ -20,9 +20,9 @@ from pathlib import Path from typing import Tuple +import yaml from tqdm import tqdm -import yaml from transformers.models.marian.convert_marian_to_pytorch import ( FRONT_MATTER_TEMPLATE, convert, diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py index 1fb5a34f064fd3..1662ffb358b44d 100644 --- a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_to_pytorch.py @@ -24,10 +24,10 @@ import numpy as np import torch +from huggingface_hub.hf_api import list_models from torch import nn from tqdm import tqdm -from huggingface_hub.hf_api import list_models from transformers import MarianConfig, MarianMTModel, MarianTokenizer @@ -631,7 +631,7 @@ def load_marian_model(self) -> MarianMTModel: model.model.decoder.embed_positions.weight = wpos_tensor if cfg.normalize_embedding: - if not ("encoder_emb_ln_scale_pre" in state_dict): + if "encoder_emb_ln_scale_pre" not in state_dict: raise ValueError("encoder_emb_ln_scale_pre is not in state dictionary") raise NotImplementedError("Need to convert layernorm_embedding") diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py index db543ef8d9c940..96b26f8325ce8c 100644 --- a/src/transformers/models/marian/modeling_flax_marian.py +++ b/src/transformers/models/marian/modeling_flax_marian.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -883,7 +882,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1440,7 +1439,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/marian/modeling_marian.py 
b/src/transformers/models/marian/modeling_marian.py index 3937058a566016..d4dc4d53dd8485 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1019,7 +1019,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1044,7 +1043,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py index 17a4f0f5d1b587..eecaf95f6cb903 100644 --- a/src/transformers/models/marian/modeling_tf_marian.py +++ b/src/transformers/models/marian/modeling_tf_marian.py @@ -817,7 +817,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1133,9 +1132,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1239,7 +1237,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): outputs = self.model( input_ids=input_ids, @@ -1461,9 +1459,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index c688733321be0d..7d2af76fc331c4 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -138,7 +138,7 @@ def __init__( model_max_length=512, sp_model_kwargs: Optional[Dict[str, Any]] = None, separate_vocabs=False, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/markuplm/configuration_markuplm.py b/src/transformers/models/markuplm/configuration_markuplm.py index aa667fa53b8977..935625b1fa1fa8 100644 --- a/src/transformers/models/markuplm/configuration_markuplm.py +++ b/src/transformers/models/markuplm/configuration_markuplm.py @@ -126,7 +126,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index b9a3b70326eac5..86c67f7fe427f8 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -640,7 +640,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
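The Marian and MarkupLM hunks above repeat the same mechanical rewrites seen throughout this patch: a trailing comma is kept after `**kwargs` in multi-line signatures, the blank line that used to sit directly under a signature or block opener is removed, and `not (x in d)` is rewritten as `x not in d`. A minimal before/after sketch of that shape, on a hypothetical `prepare_batch` helper rather than repository code, might look like:

```python
# Illustrative sketch only: the recurring signature and condition rewrites
# applied across the files above, shown on a hypothetical helper
# (`prepare_batch` is not part of the patch).

def prepare_batch(
    input_ids,
    attention_mask=None,
    **kwargs,  # trailing comma now kept after **kwargs as well
):
    # before: a blank line sat here, immediately after the signature
    if attention_mask is None:
        attention_mask = [1] * len(input_ids)

    state = {"input_ids": input_ids, "attention_mask": attention_mask, **kwargs}
    # before: if not ("input_ids" in state): raise ...
    if "input_ids" not in state:
        raise ValueError("input_ids is not in state dictionary")
    return state


print(prepare_batch([101, 2023, 102]))
```

All three rewrites are purely stylistic; the functions behave exactly as before.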
diff --git a/src/transformers/models/markuplm/processing_markuplm.py b/src/transformers/models/markuplm/processing_markuplm.py index d6251586ac67ca..51307d20eb5f3b 100644 --- a/src/transformers/models/markuplm/processing_markuplm.py +++ b/src/transformers/models/markuplm/processing_markuplm.py @@ -66,7 +66,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py index f7d0e445d0281e..f4eb0af4d9b756 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm.py +++ b/src/transformers/models/markuplm/tokenization_markuplm.py @@ -220,7 +220,7 @@ def __init__( pad_width=1001, pad_token_label=-100, only_label_first_subword=True, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -527,7 +527,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -676,7 +676,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -734,7 +734,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -857,7 +857,7 @@ def encode( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> List[int]: encoded_inputs = self.encode_plus( text=text, @@ -904,7 +904,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -970,7 +970,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1024,7 +1024,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens, @@ -1315,8 +1315,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." 
) logger.error(error_msg) diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py index 4a0a0b9e64a9d3..8c1cb73afdaac7 100644 --- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py +++ b/src/transformers/models/markuplm/tokenization_markuplm_fast.py @@ -180,7 +180,7 @@ def __init__( pad_token_label=-100, only_label_first_subword=True, trim_offsets=False, - **kwargs + **kwargs, ): super().__init__( vocab_file=vocab_file, @@ -302,7 +302,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -321,6 +321,7 @@ def __call__( node_labels (`List[int]`, `List[List[int]]`, *optional*): Node-level integer labels (for token classification tasks). """ + # Input type checking for clearer error def _is_valid_text_input(t): if isinstance(t, str): @@ -450,7 +451,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -513,7 +514,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated, @@ -736,7 +737,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # make it a batched input # 2 options: diff --git a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py index b53e695db7ebc2..ea3e530ded5216 100644 --- a/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/mask2former/convert_mask2former_original_pytorch_checkpoint_to_pytorch.py @@ -20,16 +20,16 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import torch import torchvision.transforms as T -from PIL import Image -from torch import Tensor, nn - -import requests from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.projects.deeplab import add_deeplab_config from huggingface_hub import hf_hub_download +from PIL import Image +from torch import Tensor, nn + from transformers import ( Mask2FormerConfig, Mask2FormerForUniversalSegmentation, @@ -624,7 +624,6 @@ def rename_keys_in_masked_attention_decoder(self, dst_state_dict: StateDict, src rename_keys = [] for i in range(self.config.decoder_layers - 1): - rename_keys.append( ( f"{src_prefix}.transformer_self_attention_layers.{i}.self_attn.out_proj.weight", diff --git a/src/transformers/models/mask2former/image_processing_mask2former.py b/src/transformers/models/mask2former/image_processing_mask2former.py index 01c267cdd48c53..eb93391fb354f8 100644 --- a/src/transformers/models/mask2former/image_processing_mask2former.py +++ b/src/transformers/models/mask2former/image_processing_mask2former.py @@ -391,7 
+391,7 @@ def __init__( image_std: Union[float, List[float]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): if "size_divisibility" in kwargs: warnings.warn( @@ -466,7 +466,7 @@ def resize( size_divisor: int = 0, resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an @@ -644,7 +644,7 @@ def preprocess( reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -789,7 +789,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 6fed5b05955bf8..ca9712639df2a9 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -1596,7 +1596,6 @@ def forward_post( encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - # Masked(Cross)-Attention Block cross_attn_weights = None self_attn_weights = None @@ -1656,7 +1655,6 @@ def forward_pre( encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ): - # Masked(Cross)-Attention Block cross_attn_weights = None self_attn_weights = None @@ -2016,7 +2014,6 @@ def __init__(self, hidden_size: int, num_heads: int, mask_feature_size: torch.Te self.mask_embedder = Mask2FormerMLPPredictionHead(self.hidden_size, self.hidden_size, mask_feature_size) def forward(self, outputs: torch.Tensor, pixel_embeddings: torch.Tensor, attention_mask_target_size: int = None): - mask_embeddings = self.mask_embedder(outputs.transpose(0, 1)) # Sum up over the channels @@ -2063,7 +2060,6 @@ def forward( output_hidden_states: bool = False, output_attentions: bool = False, ) -> Mask2FormerMaskedAttentionDecoderOutput: - multi_stage_features = [] multi_stage_positional_embeddings = [] size_list = [] diff --git a/src/transformers/models/maskformer/configuration_maskformer_swin.py b/src/transformers/models/maskformer/configuration_maskformer_swin.py index 36e0746552c853..e48e3d120129d8 100644 --- a/src/transformers/models/maskformer/configuration_maskformer_swin.py +++ b/src/transformers/models/maskformer/configuration_maskformer_swin.py @@ -113,7 +113,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py index c08591e044db9e..d56777d4527439 100644 --- a/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_original_pytorch_checkpoint_to_pytorch.py @@ -19,16 +19,16 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import 
torch import torchvision.transforms as T -from PIL import Image -from torch import Tensor, nn - -import requests from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg from detectron2.data import MetadataCatalog from detectron2.projects.deeplab import add_deeplab_config +from PIL import Image +from torch import Tensor, nn + from transformers.models.maskformer.feature_extraction_maskformer import MaskFormerFeatureExtractor from transformers.models.maskformer.modeling_maskformer import ( MaskFormerConfig, @@ -106,7 +106,6 @@ def setup_cfg(args: Args): class OriginalMaskFormerConfigToOursConverter: def __call__(self, original_config: object) -> MaskFormerConfig: - model = original_config.MODEL mask_former = model.MASK_FORMER swin = model.SWIN @@ -557,7 +556,6 @@ def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object def test(original_model, our_model: MaskFormerForInstanceSegmentation, feature_extractor: MaskFormerFeatureExtractor): with torch.no_grad(): - original_model = original_model.eval() our_model = our_model.eval() @@ -583,7 +581,6 @@ def test(original_model, our_model: MaskFormerForInstanceSegmentation, feature_e for original_model_feature, our_model_feature in zip( original_model_backbone_features.values(), our_model_output.encoder_hidden_states ): - assert torch.allclose( original_model_feature, our_model_feature, atol=1e-3 ), "The backbone features are not the same." @@ -635,7 +632,6 @@ def get_name(checkpoint_file: Path): if __name__ == "__main__": - parser = ArgumentParser( description="Command line to convert the original maskformers (with swin backbone) to our implementations." ) @@ -690,7 +686,6 @@ def get_name(checkpoint_file: Path): for config_file, checkpoint_file in OriginalMaskFormerCheckpointToOursConverter.using_dirs( checkpoints_dir, config_dir ): - feature_extractor = OriginalMaskFormerConfigToFeatureExtractorConverter()( setup_cfg(Args(config_file=config_file)) ) diff --git a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py index f0f69f9aa83681..0657de9ee69e0d 100644 --- a/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py @@ -21,11 +21,11 @@ import pickle from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import MaskFormerConfig, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, ResNetConfig from transformers.utils import logging diff --git a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py index 59606b1a409ad0..4ec5f64f22ae28 100644 --- a/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py +++ b/src/transformers/models/maskformer/convert_maskformer_swin_to_pytorch.py @@ -21,11 +21,11 @@ import pickle from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import MaskFormerConfig, MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation, SwinConfig from transformers.utils import logging diff --git a/src/transformers/models/maskformer/image_processing_maskformer.py 
b/src/transformers/models/maskformer/image_processing_maskformer.py index 2a59a0f4db5719..6c3119fd30bff5 100644 --- a/src/transformers/models/maskformer/image_processing_maskformer.py +++ b/src/transformers/models/maskformer/image_processing_maskformer.py @@ -395,7 +395,7 @@ def __init__( image_std: Union[float, List[float]] = None, ignore_index: Optional[int] = None, do_reduce_labels: bool = False, - **kwargs + **kwargs, ): if "size_divisibility" in kwargs: warnings.warn( @@ -486,7 +486,7 @@ def resize( size_divisor: int = 0, resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an @@ -548,7 +548,7 @@ def convert_segmentation_map_to_binary_masks( instance_id_to_semantic_id: Optional[Dict[int, int]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels ignore_index = ignore_index if ignore_index is not None else self.ignore_index @@ -665,7 +665,7 @@ def preprocess( do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -820,7 +820,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py index 83f741dbf7b6d5..1a775f57fdfb91 100644 --- a/src/transformers/models/mbart/configuration_mbart.py +++ b/src/transformers/models/mbart/configuration_mbart.py @@ -134,7 +134,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py index 42660e260754c6..78375afce4fccc 100644 --- a/src/transformers/models/mbart/modeling_flax_mbart.py +++ b/src/transformers/models/mbart/modeling_flax_mbart.py @@ -19,11 +19,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -953,7 +952,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1507,7 +1506,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/mbart/modeling_mbart.py 
b/src/transformers/models/mbart/modeling_mbart.py index ca440038960791..2548b08ed10bc7 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1068,7 +1068,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..." @@ -1093,7 +1092,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1398,7 +1396,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/mbart/modeling_tf_mbart.py b/src/transformers/models/mbart/modeling_tf_mbart.py index b222983331fb27..0f6e18b0b38616 100644 --- a/src/transformers/models/mbart/modeling_tf_mbart.py +++ b/src/transformers/models/mbart/modeling_tf_mbart.py @@ -803,7 +803,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1128,9 +1127,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, tf.Tensor]: - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1237,9 +1235,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1458,9 +1455,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py index b6b4173e50afdd..0c74175e33220e 100644 --- a/src/transformers/models/mbart/tokenization_mbart.py +++ b/src/transformers/models/mbart/tokenization_mbart.py @@ -94,9 +94,8 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): - # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py index 0ac14033a44aa8..b5d24c28dc8ff9 100644 --- a/src/transformers/models/mbart/tokenization_mbart_fast.py +++ b/src/transformers/models/mbart/tokenization_mbart_fast.py @@ -107,7 +107,7 @@ def __init__( src_lang=None, tgt_lang=None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py index 0a331b283760fa..628be52479d0c3 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50.py +++ b/src/transformers/models/mbart50/tokenization_mbart50.py @@ -125,7 +125,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py index 1ab8ff06e2609f..6bf3b48b378ccf 100644 --- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py +++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py @@ -122,7 +122,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/mctct/configuration_mctct.py b/src/transformers/models/mctct/configuration_mctct.py index 1c84f2325928e9..6389f2238fc17e 100644 --- a/src/transformers/models/mctct/configuration_mctct.py +++ b/src/transformers/models/mctct/configuration_mctct.py @@ -144,7 +144,7 @@ def __init__( conv_channels=None, ctc_loss_reduction="sum", ctc_zero_infinity=False, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.vocab_size = vocab_size diff --git a/src/transformers/models/mctct/feature_extraction_mctct.py b/src/transformers/models/mctct/feature_extraction_mctct.py index 2f1514054ef02c..d517e3caf85e08 100644 --- a/src/transformers/models/mctct/feature_extraction_mctct.py +++ b/src/transformers/models/mctct/feature_extraction_mctct.py @@ -90,7 +90,7 @@ def __init__( normalize_means=True, normalize_vars=True, return_attention_mask=False, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) @@ -247,7 +247,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). sequences. 
It returns the diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 577f7868d5aefe..db9b67090ac727 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -109,7 +109,7 @@ def __init__( pad_token_id=0, position_embedding_type="absolute", use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py index 19124a074b9af1..334b03c9335b6e 100644 --- a/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py +++ b/src/transformers/models/megatron_bert/convert_megatron_bert_checkpoint.py @@ -184,7 +184,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "attention.ln" if op_name.startswith("input") else "ln" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val @@ -192,7 +191,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Make sure the QKV pointer is nil. assert attention_qkv_weight is None, "" @@ -204,7 +202,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - # Make sure we read the weight tensor. assert attention_qkv_weight is not None, "" @@ -232,7 +229,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # Copy weights and biases as is. elif weight_or_bias in ["weight", "bias"]: - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + weight_or_bias] = val diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 636130b6f5a044..5b7889adbc94a0 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -543,7 +543,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
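The Megatron checkpoint-conversion hunks that follow mostly delete the blank line that used to follow an `if`/`elif` header, and the modeling hunks do the same for the blank line under a class header. A small illustrative sketch of that cleanup, using hypothetical names (`DummyHead`, `rename`) rather than the real conversion code:

```python
# Illustrative sketch only: the blank line that used to sit directly under an
# `if`/`elif` or `class` header is dropped, as in the surrounding hunks.
# `DummyHead`, `rename`, and the mapping below are hypothetical stand-ins.

megatron_to_transformers = {"mlp.dense_h_to_4h": ".mlp.c_fc."}  # simplified stand-in


class DummyHead:
    # before: a blank line separated the class header from this attribute
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config=None):
        self.config = config


def rename(layer_name, op_name, weight_or_bias):
    if op_name.endswith("layernorm"):
        # before: a blank line followed the `if` header above
        return f"{layer_name}.ln.{weight_or_bias}"
    elif weight_or_bias in ("weight", "bias"):
        # before: a blank line followed the `elif` header above
        return layer_name + megatron_to_transformers.get(op_name, ".") + weight_or_bias
    return None


print(rename("transformer.h.0", "mlp.dense_h_to_4h", "weight"))
```

The removal is formatting-only; no control flow or output changes.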
@@ -1120,7 +1119,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForCausalLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"] @@ -1266,7 +1264,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING) class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"] @@ -1375,7 +1372,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"predictions"] def __init__(self, config): @@ -1401,7 +1397,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs + **kwargs, ) -> Union[Tuple, NextSentencePredictorOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): @@ -1672,7 +1668,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1753,7 +1748,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index f8fb1e7c8f3bcc..ccb8efb0d5d42f 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -475,7 +475,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = params @@ -483,7 +482,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=dtype)).view( 1, 1, n_positions, n_positions @@ -510,7 +508,6 @@ def convert_checkpoint_from_megatron_to_transformers(args): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - out_val = megatron_to_transformers_fix_query_key_value_ordering( params, checkpoint_version, 3, heads, hidden_size_per_head ) @@ -519,13 +516,11 @@ def convert_checkpoint_from_megatron_to_transformers(args): # Transpose the weights. elif weight_or_bias == "weight": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "weight"] = params.transpose(0, 1) # Copy the bias. 
elif weight_or_bias == "bias": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "bias"] = params diff --git a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py index 778b1384a28b36..db2774e90c87df 100644 --- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py +++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py @@ -179,7 +179,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): # For layernorm(s), simply store the layer norm. if op_name.endswith("layernorm"): - ln_name = "ln_1" if op_name.startswith("input") else "ln_2" output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val @@ -187,7 +186,6 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "weight": - # Insert a tensor of 1x1xDxD bias. causal_mask = torch.tril(torch.ones((n_positions, n_positions), dtype=torch.float16)).view( 1, 1, n_positions, n_positions @@ -208,20 +206,17 @@ def convert_megatron_checkpoint(args, input_state_dict, config): elif ( op_name == "attention.query_key_value" or op_name == "self_attention.query_key_value" ) and weight_or_bias == "bias": - out_val = fix_query_key_value_ordering(val, checkpoint_version, 3, heads, hidden_size_per_head) # Store. No change of shape. output_state_dict[layer_name + ".attn.c_attn.bias"] = out_val # Transpose the weights. elif weight_or_bias == "weight": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "weight"] = val.transpose(0, 1) # Copy the bias. elif weight_or_bias == "bias": - out_name = megatron_to_transformers[op_name] output_state_dict[layer_name + out_name + "bias"] = val @@ -276,7 +271,6 @@ def main(): # Read the config, or default to the model released by NVIDIA. if args.config_file == "": - if ds_args is not None: if ds_args.bias_gelu_fusion: activation_function = "gelu_fast" diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index fee7666a51c706..58cc9f11ab7e3e 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -23,7 +23,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np - import sentencepiece as spm from ...tokenization_utils import PreTrainedTokenizer @@ -388,7 +387,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -534,9 +533,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. 
" @@ -617,7 +615,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -710,7 +708,6 @@ def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spa ) if entities is not None: - if not isinstance(entities, list): raise ValueError("If you specify entities, they should be given as a list") @@ -729,7 +726,7 @@ def _create_input_sequence( entities_pair: Optional[EntityInput] = None, entity_spans: Optional[EntitySpanInput] = None, entity_spans_pair: Optional[EntitySpanInput] = None, - **kwargs + **kwargs, ) -> Tuple[list, list, list, list, list, list]: def get_input_ids(text): tokens = self.tokenize(text, **kwargs) @@ -770,7 +767,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_entity_token_spans, second_entity_token_spans = None, None if self.task is None: - if entity_spans is None: first_ids = get_input_ids(text) else: @@ -853,7 +849,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans): first_ids = first_ids[:entity_token_start] + [special_token_id] + first_ids[entity_token_start:] elif self.task == "entity_span_classification": - if not (isinstance(entity_spans, list) and len(entity_spans) > 0 and isinstance(entity_spans[0], tuple)): raise ValueError( "Entity spans should be provided as a list of tuples, " @@ -984,7 +979,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids, diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py index 7034cdb2769b14..afe6c3b3d92798 100644 --- a/src/transformers/models/mobilebert/configuration_mobilebert.py +++ b/src/transformers/models/mobilebert/configuration_mobilebert.py @@ -136,7 +136,7 @@ def __init__( normalization_type="no_norm", classifier_activation=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index c74096ae4e3453..8e9ff45d6acf70 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -1035,7 +1035,6 @@ def forward( @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) class MobileBertForMaskedLM(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [ "cls.predictions.decoder.weight", @@ -1349,7 +1348,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1553,7 +1551,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing class MobileBertForTokenClassification(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py 
b/src/transformers/models/mobilebert/tokenization_mobilebert.py index 023a5f74dabcef..0ccf9efe02cc5b 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py @@ -122,7 +122,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -170,7 +170,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py index 36ea9c61e48c6a..6bac366d237859 100644 --- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py +++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py @@ -101,7 +101,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py index f13ed1b2ea3308..6e367874b760a4 100644 --- a/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py @@ -91,7 +91,7 @@ def __init__( classifier_dropout_prob=0.999, initializer_range=0.02, layer_norm_eps=0.001, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -110,7 +110,6 @@ def __init__( class MobileNetV1OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py index c00ec9c703e067..66533cb8d034c1 100644 --- a/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v1/convert_original_tf_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ import re from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileNetV1Config, MobileNetV1FeatureExtractor, diff --git a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py index 9843f600bef27b..c332b96c86a8a2 100644 --- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py @@ -98,7 +98,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 256} @@ -122,7 +122,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. 
The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -149,7 +149,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -194,7 +194,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py index 5c46d4c10a44e6..73003c9ded9ffc 100644 --- a/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/configuration_mobilenet_v2.py @@ -115,7 +115,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=0.001, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -140,7 +140,6 @@ def __init__( class MobileNetV2OnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py index 70a00d7d23392e..8b216aecb83d8c 100644 --- a/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/mobilenet_v2/convert_original_tf_checkpoint_to_pytorch.py @@ -20,11 +20,11 @@ import re from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileNetV2Config, MobileNetV2ForImageClassification, diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py index 343152ebde2d73..7b24547749a16f 100644 --- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py @@ -103,7 +103,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 256} @@ -127,7 +127,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -154,7 +154,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `size` along any @@ -201,7 +201,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
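The image-processor methods whose signatures change here are documented inline as simple elementwise operations: `rescale` computes `image = image * scale` and `normalize` computes `image = (image - image_mean) / image_std`. A standalone NumPy sketch of those two formulas, independent of the transformers classes and using illustrative values only:

    import numpy as np

    image = np.array([[0.0, 128.0, 255.0]])  # toy 1x3 "image" with values in [0, 255]
    rescaled = image * (1 / 255)             # rescale: image = image * scale        -> approx [0, 0.502, 1]
    normalized = (rescaled - 0.5) / 0.5      # normalize: (image - mean) / std       -> approx [-1, 0.004, 1]
    print(rescaled, normalized)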
diff --git a/src/transformers/models/mobilevit/configuration_mobilevit.py b/src/transformers/models/mobilevit/configuration_mobilevit.py index 83406c96d830c2..fe782c39821a93 100644 --- a/src/transformers/models/mobilevit/configuration_mobilevit.py +++ b/src/transformers/models/mobilevit/configuration_mobilevit.py @@ -136,7 +136,7 @@ def __init__( atrous_rates=[6, 12, 18], aspp_dropout_prob=0.1, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -166,7 +166,6 @@ def __init__( class MobileViTOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py index bc61f8822efa0f..50da675851bcb0 100644 --- a/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py +++ b/src/transformers/models/mobilevit/convert_mlcvnets_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( MobileViTConfig, MobileViTFeatureExtractor, diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py index 7cf24216fe2b7a..b600009c2eada9 100644 --- a/src/transformers/models/mobilevit/image_processing_mobilevit.py +++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py @@ -117,7 +117,7 @@ def __init__( do_center_crop: bool = True, crop_size: Dict[str, int] = None, do_flip_channel_order: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -140,7 +140,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -167,7 +167,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to size `(size["height], size["width"])`. If the input size is smaller than `size` along @@ -191,7 +191,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. 
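The checkpoint-conversion scripts in these hunks (mobilenet_v1, mobilenet_v2, mobilevit) have their imports regrouped rather than changed: `requests`, `torch`, `huggingface_hub`, and `PIL` now sit together in a single third-party block, with the `transformers` imports following in their own block. A sketch of the resulting order for one such script, using only module names that already appear in the hunks above:

    import requests
    import torch
    from huggingface_hub import hf_hub_download
    from PIL import Image

    from transformers import MobileViTConfig, MobileViTFeatureExtractor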
diff --git a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py index e7f44b222b67b0..1b06f36536d6b9 100644 --- a/src/transformers/models/mobilevit/modeling_tf_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_tf_mobilevit.py @@ -91,7 +91,7 @@ def __init__( dilation: int = 1, use_normalization: bool = True, use_activation: Union[bool, str] = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) logger.warning( @@ -197,7 +197,7 @@ def __init__( out_channels: int, stride: int = 1, num_stages: int = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) @@ -383,7 +383,7 @@ def __init__( hidden_size: int, num_stages: int, dilation: int = 1, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) self.patch_width = config.patch_size @@ -851,7 +851,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[Tuple[tf.Tensor], TFBaseModelOutputWithPooling]: - output = self.mobilevit(pixel_values, output_hidden_states, return_dict, training=training) return output diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index cd9515aa4c8933..6dc145fdccc542 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -163,7 +163,6 @@ def forward( output_attentions=False, **kwargs, ): - q = self.q(hidden_states) k = self.k(hidden_states) v = self.v(hidden_states) @@ -480,7 +479,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: MPNET_START_DOCSTRING, ) class MPNetModel(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config, add_pooling_layer=True): diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index fd944ce678129d..48866e21d4d8f8 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -510,7 +510,6 @@ def call( return_dict=None, training=False, ): - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -781,7 +780,6 @@ def call(self, hidden_states): @add_start_docstrings("""MPNet Model with a `language modeling` head on top.""", MPNET_START_DOCSTRING) class TFMPNetForMaskedLM(TFMPNetPreTrainedModel, TFMaskedLanguageModelingLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -891,7 +889,6 @@ def call(self, features, training=False): MPNET_START_DOCSTRING, ) class TFMPNetForSequenceClassification(TFMPNetPreTrainedModel, TFSequenceClassificationLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -1087,7 +1084,6 @@ def serving_output(self, output: TFMultipleChoiceModelOutput) -> TFMultipleChoic MPNET_START_DOCSTRING, ) class TFMPNetForTokenClassification(TFMPNetPreTrainedModel, TFTokenClassificationLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): @@ -1169,7 +1165,6 @@ def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOu MPNET_START_DOCSTRING, ) class TFMPNetForQuestionAnswering(TFMPNetPreTrainedModel, TFQuestionAnsweringLoss): - _keys_to_ignore_on_load_missing = [r"pooler"] def __init__(self, config, *inputs, **kwargs): diff --git 
a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py index 28d8b7096ae118..1f5ad2f41aae41 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet.py +++ b/src/transformers/models/mpnet/tokenization_mpnet.py @@ -145,7 +145,7 @@ def __init__( mask_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -205,7 +205,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py index f2fe4fe4fe8f0d..288c69c62b3cd5 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py +++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py @@ -124,7 +124,7 @@ def __init__( mask_token="", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py index d9232c94629db2..168f0eb1cabefc 100644 --- a/src/transformers/models/mt5/configuration_mt5.py +++ b/src/transformers/models/mt5/configuration_mt5.py @@ -91,7 +91,7 @@ def __init__( pad_token_id=0, eos_token_id=1, decoder_start_token_id=0, - **kwargs + **kwargs, ): super().__init__( is_encoder_decoder=is_encoder_decoder, diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index dac08695b6b333..07fab1ce54bb2a 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -117,7 +117,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # MT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -529,7 +528,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") @@ -1755,9 +1753,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/mvp/configuration_mvp.py b/src/transformers/models/mvp/configuration_mvp.py index 63a006b8e4292f..546da24954c11f 100644 --- a/src/transformers/models/mvp/configuration_mvp.py +++ b/src/transformers/models/mvp/configuration_mvp.py @@ -138,7 +138,7 @@ def __init__( use_prompt=False, prompt_length=100, prompt_mid_dim=800, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 8823bf17197c0c..dde522535d7a0c 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -1211,7 +1211,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1238,7 +1237,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1353,7 +1351,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, Mvp automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: @@ -1560,7 +1557,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py index 3d5d606d63b5a6..98d373188e0535 100644 --- a/src/transformers/models/mvp/tokenization_mvp.py +++ b/src/transformers/models/mvp/tokenization_mvp.py @@ -180,7 +180,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py index 00b7a5c6651e6f..28dd1ea942df6b 100644 --- a/src/transformers/models/mvp/tokenization_mvp_fast.py +++ b/src/transformers/models/mvp/tokenization_mvp_fast.py @@ -149,7 +149,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/nat/configuration_nat.py b/src/transformers/models/nat/configuration_nat.py index 32272e62c56274..35f14768a25120 100644 --- a/src/transformers/models/nat/configuration_nat.py +++ b/src/transformers/models/nat/configuration_nat.py @@ -114,7 +114,7 @@ def __init__( layer_norm_eps=1e-5, layer_scale_init_value=0.0, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py 
index c2e445b7ae994b..d455d9e5ee2b5b 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -329,7 +329,6 @@ def forward( hidden_states: torch.Tensor, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor]: - query_layer = self.transpose_for_scores(self.query(hidden_states)) key_layer = self.transpose_for_scores(self.key(hidden_states)) value_layer = self.transpose_for_scores(self.value(hidden_states)) diff --git a/src/transformers/models/nezha/configuration_nezha.py b/src/transformers/models/nezha/configuration_nezha.py index 7c07a0d9bc2b2d..8d191b7d96ec60 100644 --- a/src/transformers/models/nezha/configuration_nezha.py +++ b/src/transformers/models/nezha/configuration_nezha.py @@ -86,7 +86,7 @@ def __init__( bos_token_id=2, eos_token_id=3, use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index b8bf1ac16487fa..98712f0aced535 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -570,7 +570,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1138,7 +1137,6 @@ def forward( @add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING) class NezhaForMaskedLM(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"] @@ -1540,7 +1538,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForTokenClassification(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1622,7 +1619,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForQuestionAnswering(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py index 6a326fd3ca10b7..ac2aa2380b1b59 100644 --- a/src/transformers/models/nllb/tokenization_nllb.py +++ b/src/transformers/models/nllb/tokenization_nllb.py @@ -140,9 +140,8 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): - # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py index 1afe27f43b4e53..7c6979e295b956 100644 --- a/src/transformers/models/nllb/tokenization_nllb_fast.py +++ b/src/transformers/models/nllb/tokenization_nllb_fast.py @@ -151,7 +151,7 @@ def __init__( src_lang=None, tgt_lang=None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/nystromformer/configuration_nystromformer.py b/src/transformers/models/nystromformer/configuration_nystromformer.py index 9a1cf726ea73c8..98b3e511ac0e21 100644 --- a/src/transformers/models/nystromformer/configuration_nystromformer.py +++ b/src/transformers/models/nystromformer/configuration_nystromformer.py @@ -112,7 +112,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py index d8e2cfac10190b..8d5a52bdbf82da 100644 --- a/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py @@ -78,7 +78,6 @@ def convert_checkpoint_helper(config, orig_state_dict): def convert_nystromformer_checkpoint(checkpoint_path, nystromformer_config_file, pytorch_dump_path): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"] config = NystromformerConfig.from_json_file(nystromformer_config_file) model = NystromformerForMaskedLM(config) diff --git a/src/transformers/models/oneformer/convert_to_hf_oneformer.py b/src/transformers/models/oneformer/convert_to_hf_oneformer.py index 074cc659b2688f..bfe2aee5e22270 100644 --- a/src/transformers/models/oneformer/convert_to_hf_oneformer.py +++ b/src/transformers/models/oneformer/convert_to_hf_oneformer.py @@ -23,13 +23,12 @@ from pprint import pformat from typing import Any, Dict, Iterator, List, Set, Tuple +import requests import torch import torchvision.transforms as T from PIL import Image from torch import Tensor, nn -import requests - try: from detectron2.checkpoint import DetectionCheckpointer diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py index af36dbe0ab94df..b1e93c9e393811 100644 --- a/src/transformers/models/oneformer/image_processing_oneformer.py +++ b/src/transformers/models/oneformer/image_processing_oneformer.py @@ -19,8 +19,8 @@ from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import numpy as np - from huggingface_hub import hf_hub_download + from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from transformers.image_transforms import ( PaddingMode, @@ -403,7 +403,7 @@ def __init__( repo_path: str = "shi-labs/oneformer_demo", class_info_file: str = None, num_text: Optional[int] = None, - **kwargs + **kwargs, ): if "max_size" in kwargs: self._max_size = kwargs.pop("max_size") @@ -443,7 +443,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format=None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. 
If size is an @@ -507,7 +507,7 @@ def convert_segmentation_map_to_binary_masks( instance_id_to_semantic_id: Optional[Dict[int, int]] = None, ignore_index: Optional[int] = None, reduce_labels: bool = False, - **kwargs + **kwargs, ): reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels ignore_index = ignore_index if ignore_index is not None else self.ignore_index @@ -619,7 +619,7 @@ def preprocess( do_reduce_labels: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: if "pad_and_return_pixel_mask" in kwargs: warnings.warn( @@ -881,7 +881,7 @@ def encode_inputs( ignore_index: Optional[int] = None, reduce_labels: bool = False, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ): """ Pad images up to the largest image in a batch and create a corresponding `pixel_mask`. diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 2dd04c0a4303f3..84539b83d96db7 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -2507,7 +2507,7 @@ def __init__( visual_dim=1024, dropout=0.1, layer_norm_eps=1e-05, - **kwargs + **kwargs, ): super().__init__() diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py index b08c72daf0492c..df16be211c5d22 100644 --- a/src/transformers/models/openai/configuration_openai.py +++ b/src/transformers/models/openai/configuration_openai.py @@ -136,7 +136,7 @@ def __init__( summary_activation=None, summary_proj_to_labels=True, summary_first_dropout=0.1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.n_positions = n_positions diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py index fbe63a001e3856..4bd4f506e92085 100644 --- a/src/transformers/models/openai/modeling_tf_openai.py +++ b/src/transformers/models/openai/modeling_tf_openai.py @@ -248,7 +248,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutput]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -544,7 +543,6 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = False, ) -> Union[Tuple, TFBaseModelOutput]: - outputs = self.transformer( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/opt/configuration_opt.py b/src/transformers/models/opt/configuration_opt.py index f8b5bc4d8faf9f..df13b32019984e 100644 --- a/src/transformers/models/opt/configuration_opt.py +++ b/src/transformers/models/opt/configuration_opt.py @@ -118,7 +118,7 @@ def __init__( eos_token_id=2, enable_bias=True, layer_norm_elementwise_affine=True, - **kwargs + **kwargs, ): super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/opt/modeling_flax_opt.py b/src/transformers/models/opt/modeling_flax_opt.py index db9154b33b277b..b7038008f5d4d6 100644 --- a/src/transformers/models/opt/modeling_flax_opt.py +++ b/src/transformers/models/opt/modeling_flax_opt.py @@ -309,7 +309,6 @@ def __call__( output_attentions: bool = True, deterministic: bool = True, ) -> Tuple[jnp.ndarray]: - residual = 
hidden_states # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention @@ -527,7 +526,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -666,7 +665,6 @@ def __call__( deterministic: bool = True, init_cache=False, ): - decoder_outputs = self.decoder( input_ids=input_ids, attention_mask=attention_mask, @@ -726,7 +724,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index d98b2ff058ee9f..5269c67de79c9e 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -391,7 +391,6 @@ def forward( OPT_START_DOCSTRING, ) class OPTPreTrainedModel(PreTrainedModel): - config_class = OPTConfig base_model_prefix = "model" supports_gradient_checkpointing = True @@ -671,7 +670,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -693,7 +691,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -771,7 +768,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/opt/modeling_tf_opt.py b/src/transformers/models/opt/modeling_tf_opt.py index c9f4c087b387a6..2fcbd444a1d141 100644 --- a/src/transformers/models/opt/modeling_tf_opt.py +++ b/src/transformers/models/opt/modeling_tf_opt.py @@ -742,9 +742,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -815,9 +814,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFBaseModelOutputWithPast, Tuple[tf.Tensor]]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -915,7 +913,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[TFCausalLMOutputWithPast, Tuple[tf.Tensor]]: r""" Args: diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index d84cf64937ddfb..978af9dc3d33d0 100644 --- 
a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -109,7 +109,7 @@ def __init__( pad_token_id=0, bos_token_id=49406, eos_token_id=49407, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -127,7 +127,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from OwlViTConfig @@ -212,7 +211,7 @@ def __init__( attention_dropout=0.0, initializer_range=0.02, initializer_factor=1.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -231,7 +230,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from OwlViTConfig @@ -281,7 +279,7 @@ def __init__( projection_dim=512, logit_scale_init_value=2.6592, return_dict=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -375,7 +373,6 @@ def generate_dummy_inputs( seq_length: int = -1, framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework ) diff --git a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py index 09942fa3928d0b..d2ea6b0a6c516d 100644 --- a/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py +++ b/src/transformers/models/owlvit/convert_owlvit_original_flax_to_hf.py @@ -18,14 +18,14 @@ import argparse import collections -import torch -import torch.nn as nn - import jax import jax.numpy as jnp +import torch +import torch.nn as nn from clip.model import CLIP from flax.training import checkpoints from huggingface_hub import Repository + from transformers import ( CLIPTokenizer, OwlViTConfig, @@ -314,7 +314,6 @@ def convert_clip_backbone(flax_params, torch_config): # Copy flax CLIP backbone params to PyTorch params for name, param in new_torch_params.items(): if name in torch_clip_params.keys(): - new_param = torch.from_numpy(new_torch_params[name]) torch_clip_params[name].copy_(new_param) else: diff --git a/src/transformers/models/owlvit/image_processing_owlvit.py b/src/transformers/models/owlvit/image_processing_owlvit.py index 650a2d787e636e..ee5d230ea3c341 100644 --- a/src/transformers/models/owlvit/image_processing_owlvit.py +++ b/src/transformers/models/owlvit/image_processing_owlvit.py @@ -136,7 +136,7 @@ def __init__( do_normalize=True, image_mean=None, image_std=None, - **kwargs + **kwargs, ): size = size if size is not None else {"height": 768, "width": 768} size = get_size_dict(size, default_to_square=True) @@ -169,7 +169,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to a certain size. @@ -185,7 +185,7 @@ def center_crop( image: np.ndarray, crop_size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to a certain size. 
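The `center_crop` methods touched here only gain the trailing comma; the operation named in their docstrings is the usual centered slice. A generic NumPy sketch of that arithmetic follows; it is an illustration under simplifying assumptions, not the library's implementation, which also handles channel layout and inputs smaller than the crop:

    import numpy as np

    def center_crop(image: np.ndarray, height: int, width: int) -> np.ndarray:
        """Crop an (H, W, C) array to (height, width, C) around its center."""
        top = (image.shape[0] - height) // 2
        left = (image.shape[1] - width) // 2
        return image[top : top + height, left : left + width]

    print(center_crop(np.zeros((768, 768, 3)), 224, 224).shape)  # (224, 224, 3)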
@@ -201,7 +201,7 @@ def rescale( image: np.ndarray, rescale_factor: float, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Rescale an image by a certain factor. @@ -214,7 +214,7 @@ def normalize( mean: List[float], std: List[float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image with a certain mean and standard deviation. @@ -236,7 +236,7 @@ def preprocess( image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Prepares an image or batch of images for the model. diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 0a7599639bf38d..1a0f3ed0f6d817 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -1276,7 +1276,6 @@ def forward( query_embeds: Optional[torch.FloatTensor], query_mask: Optional[torch.Tensor], ) -> Tuple[torch.FloatTensor]: - image_class_embeds = self.dense0(image_embeds) if query_embeds is None: device = image_class_embeds.device @@ -1408,7 +1407,6 @@ def image_text_embedder( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[torch.FloatTensor]: - # Encode text and image outputs = self.owlvit( pixel_values=pixel_values, @@ -1478,7 +1476,6 @@ def image_embedder( def embed_image_query( self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor ) -> torch.FloatTensor: - _, class_embeds = self.class_predictor(query_image_features) pred_boxes = self.box_predictor(query_image_features, query_feature_map) pred_boxes_as_corners = center_to_corners_format(pred_boxes) diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py index f38d61ff8a027f..fd7de9a1a490b9 100644 --- a/src/transformers/models/pegasus/configuration_pegasus.py +++ b/src/transformers/models/pegasus/configuration_pegasus.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, eos_token_id=1, forced_eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py b/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py index 9254a0ba941100..739e075233f764 100644 --- a/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py +++ b/src/transformers/models/pegasus/convert_pegasus_tf_to_pytorch.py @@ -46,7 +46,6 @@ def rename_state_dict_key(k): - for pegasus_name, hf_name in PATTERNS: k = k.replace(pegasus_name, hf_name) return k diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py index 75d38d59ef2214..b39e2c437e537e 100644 --- a/src/transformers/models/pegasus/modeling_flax_pegasus.py +++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py @@ -20,11 +20,10 @@ from functools import partial from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ 
-903,7 +902,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1458,7 +1457,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 18e17ab5732de7..d4cc8f381e1437 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1069,7 +1069,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1094,7 +1093,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1458,7 +1456,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/pegasus/modeling_tf_pegasus.py b/src/transformers/models/pegasus/modeling_tf_pegasus.py index a8c27b6497b909..f38dbde5f37afb 100644 --- a/src/transformers/models/pegasus/modeling_tf_pegasus.py +++ b/src/transformers/models/pegasus/modeling_tf_pegasus.py @@ -821,7 +821,6 @@ def call( # encoder layers for idx, encoder_layer in enumerate(self.layers): - if output_hidden_states: encoder_states = encoder_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) @@ -1142,9 +1141,8 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): - if decoder_input_ids is None and decoder_inputs_embeds is None: use_cache = False @@ -1248,9 +1246,8 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, @@ -1471,9 +1468,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past_key_values is used if past_key_values is not None: decoder_input_ids = decoder_input_ids[:, -1:] diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 77127125bb483b..814602fac88d0d 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -113,7 +113,7 @@ def __init__( additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.offset = offset if additional_special_tokens is not None: diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py index 
22c6018385f6d0..d0345fe60cab91 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py +++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py @@ -108,7 +108,7 @@ def __init__( mask_token_sent="", additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining - **kwargs + **kwargs, ): self.offset = offset diff --git a/src/transformers/models/pegasus_x/configuration_pegasus_x.py b/src/transformers/models/pegasus_x/configuration_pegasus_x.py index 1fc0bfa837454d..c393a6b8a91044 100644 --- a/src/transformers/models/pegasus_x/configuration_pegasus_x.py +++ b/src/transformers/models/pegasus_x/configuration_pegasus_x.py @@ -135,7 +135,7 @@ def __init__( num_global_tokens=32, block_size=512, stagger_local_blocks=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index e33cf1cd3fdee1..f09a0447fd75fb 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1310,7 +1310,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1333,7 +1332,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1664,7 +1662,7 @@ def prepare_inputs_for_generation( attention_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py index a4b475532ff87a..9a7c3457882049 100644 --- a/src/transformers/models/perceiver/configuration_perceiver.py +++ b/src/transformers/models/perceiver/configuration_perceiver.py @@ -145,7 +145,7 @@ def __init__( audio_samples_per_frame=1920, samples_per_patch=16, output_shape=[1, 16, 224, 224], - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py index d1a4fd14e57602..9c925313a343d1 100644 --- a/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py +++ b/src/transformers/models/perceiver/convert_perceiver_haiku_to_pytorch.py @@ -20,13 +20,13 @@ import pickle from pathlib import Path +import haiku as hk import numpy as np +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import haiku as hk -import requests -from huggingface_hub import hf_hub_download from transformers import ( PerceiverConfig, PerceiverFeatureExtractor, diff --git a/src/transformers/models/perceiver/image_processing_perceiver.py b/src/transformers/models/perceiver/image_processing_perceiver.py index 00bf238865569e..59b7fd5332bf6f 100644 --- a/src/transformers/models/perceiver/image_processing_perceiver.py +++ b/src/transformers/models/perceiver/image_processing_perceiver.py @@ -95,7 +95,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: 
super().__init__(**kwargs) crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256} @@ -120,7 +120,7 @@ def center_crop( crop_size: Dict[str, int], size: Optional[int] = None, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"] / crop_size["height"] * min_dim, size["width"] / crop_size["width"] * @@ -155,7 +155,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PIL.Image.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -182,7 +182,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -203,7 +203,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py index 00d5bcb01ba2dc..9a6b6bad5f11db 100755 --- a/src/transformers/models/perceiver/modeling_perceiver.py +++ b/src/transformers/models/perceiver/modeling_perceiver.py @@ -2442,7 +2442,7 @@ def __init__( output_num_channels: int, min_padding_size: Optional[int] = 2, subsampled_index_dims: Optional[Dict[str, PerceiverAbstractDecoder]] = None, - **decoder_kwargs + **decoder_kwargs, ) -> None: super().__init__() self.modalities = nn.ModuleDict(modalities) diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py index 958d8a9c1d6115..cbfd9e64150243 100644 --- a/src/transformers/models/perceiver/tokenization_perceiver.py +++ b/src/transformers/models/perceiver/tokenization_perceiver.py @@ -66,9 +66,8 @@ def __init__( cls_token="[CLS]", sep_token="[SEP]", model_max_length=2048, - **kwargs + **kwargs, ) -> None: - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py index a37a5645ae424e..dd294ac43a68d5 100644 --- a/src/transformers/models/phobert/tokenization_phobert.py +++ b/src/transformers/models/phobert/tokenization_phobert.py @@ -129,7 +129,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): super().__init__( bos_token=bos_token, diff --git a/src/transformers/models/plbart/configuration_plbart.py b/src/transformers/models/plbart/configuration_plbart.py index e3d6c7fbe9fbf1..25f4c31c577859 100644 --- a/src/transformers/models/plbart/configuration_plbart.py +++ b/src/transformers/models/plbart/configuration_plbart.py @@ -132,7 +132,7 @@ def __init__( bos_token_id=0, eos_token_id=2, forced_eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/plbart/modeling_plbart.py 
b/src/transformers/models/plbart/modeling_plbart.py index d0828941ffe75e..ed506a69356f8c 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1047,7 +1047,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1072,7 +1071,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1371,7 +1369,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, encoder_outputs: Optional[List[torch.FloatTensor]] = None, - **kwargs # TODO: Check if this is needed. It is unused? + **kwargs, # TODO: Check if this is needed. It is unused? ) -> Dict[str, Any]: # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py index 94ec77c468c971..bf47538eaabdaf 100644 --- a/src/transformers/models/plbart/tokenization_plbart.py +++ b/src/transformers/models/plbart/tokenization_plbart.py @@ -189,7 +189,7 @@ def __init__( tgt_lang=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, additional_special_tokens=None, - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/poolformer/configuration_poolformer.py b/src/transformers/models/poolformer/configuration_poolformer.py index c55f13b80c9623..550c387adcefe8 100644 --- a/src/transformers/models/poolformer/configuration_poolformer.py +++ b/src/transformers/models/poolformer/configuration_poolformer.py @@ -111,7 +111,7 @@ def __init__( use_layer_scale=True, layer_scale_init_value=1e-5, initializer_range=0.02, - **kwargs + **kwargs, ): self.num_channels = num_channels self.patch_size = patch_size @@ -133,7 +133,6 @@ def __init__( class PoolFormerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py index 4ab0d2bfb3d457..d00a9970aaae66 100644 --- a/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py +++ b/src/transformers/models/poolformer/convert_poolformer_original_to_pytorch.py @@ -19,11 +19,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import PoolFormerConfig, PoolFormerFeatureExtractor, PoolFormerForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/poolformer/image_processing_poolformer.py b/src/transformers/models/poolformer/image_processing_poolformer.py index d78bf30327b80e..2548c03da469d2 100644 --- a/src/transformers/models/poolformer/image_processing_poolformer.py +++ b/src/transformers/models/poolformer/image_processing_poolformer.py @@ -116,7 +116,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, 
image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -143,7 +143,7 @@ def resize( crop_pct: Optional[float] = None, resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -203,7 +203,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to (size["height"], size["width"]). If the input size is smaller than `crop_size` along @@ -227,7 +227,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -248,7 +248,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py index 40f5939d99bc7d..35988eaa132128 100644 --- a/src/transformers/models/prophetnet/configuration_prophetnet.py +++ b/src/transformers/models/prophetnet/configuration_prophetnet.py @@ -132,7 +132,7 @@ def __init__( pad_token_id: Optional[int] = 0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py index 638a71ef2fa423..c9e64c06ef769a 100644 --- a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py @@ -19,8 +19,6 @@ from torch import nn -from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging - # transformers_old should correspond to branch `save_old_prophetnet_model_structure` here # original prophetnet_checkpoints are saved under `patrickvonplaten/..._old` respectively from transformers_old.modeling_prophetnet import ( @@ -30,6 +28,8 @@ XLMProphetNetForConditionalGeneration as XLMProphetNetForConditionalGenerationOld, ) +from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging + logger = logging.get_logger(__name__) logging.set_verbosity_info() diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index baf7e1dc4ebb4c..7dbef7c6e29245 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -666,7 +666,6 @@ def forward( past_key_value: Optional[Tuple[Tensor]] = None, output_attentions: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: - batch_size, tgt_len, hidden_size = hidden_states.size() # if key_value_states are provided this layer is used as a cross-attention layer @@ -1589,7 +1588,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is 
not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py index 05e03ad4881a0a..36104d49fb2d8b 100644 --- a/src/transformers/models/prophetnet/tokenization_prophetnet.py +++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py @@ -340,7 +340,7 @@ def __init__( mask_token: Optional[str] = "[MASK]", tokenize_chinese_chars: Optional[bool] = True, strip_accents: Optional[bool] = None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -385,7 +385,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py index 090617a6308f95..c4f8c1559e61da 100644 --- a/src/transformers/models/qdqbert/configuration_qdqbert.py +++ b/src/transformers/models/qdqbert/configuration_qdqbert.py @@ -105,7 +105,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index 40e581d4f6da12..ba24b897234363 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -574,7 +574,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
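(Aside, not part of the patch.) The hunks around here delete nothing but a blank line that used to sit directly under a block opener, whether an `if self.gradient_checkpointing and self.training:` check or a `class ...:` header before its first attribute. This looks like the Black 23.x stable style, which removes empty lines immediately after a block is opened; a minimal runnable sketch with hypothetical names, showing the layout the formatter produces:

    import logging

    logger = logging.getLogger(__name__)

    def maybe_disable_cache(gradient_checkpointing: bool, training: bool, use_cache: bool) -> bool:
        # The old layout had an empty line right after this `if`; the reformatted
        # code keeps the body flush against the opener, as in the hunks above.
        if gradient_checkpointing and training:
            if use_cache:
                logger.warning("`use_cache=True` is incompatible with gradient checkpointing.")
                use_cache = False
        return use_cache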
@@ -1013,7 +1012,6 @@ def forward( """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING ) class QDQBertLMHeadModel(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1145,7 +1143,7 @@ def prepare_inputs_for_generation( input_ids: Optional[torch.LongTensor], past_key_values=None, attention_mask: Optional[torch.Tensor] = None, - **model_kwargs + **model_kwargs, ): input_shape = input_ids.shape # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly @@ -1167,7 +1165,6 @@ def _reorder_cache(self, past, beam_idx): @add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING) class QDQBertForMaskedLM(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] @@ -1571,7 +1568,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForTokenClassification(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1652,7 +1648,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForQuestionAnswering(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py index 109588eadbdfaa..59d2951deffe1c 100644 --- a/src/transformers/models/rag/configuration_rag.py +++ b/src/transformers/models/rag/configuration_rag.py @@ -112,7 +112,7 @@ def __init__( output_retrieved=False, use_cache=True, forced_eos_token_id=None, - **kwargs + **kwargs, ): super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 52aca10d5c03ed..df6b56d05c6eee 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -246,7 +246,7 @@ def from_pretrained_question_encoder_generator( question_encoder_pretrained_model_name_or_path: str = None, generator_pretrained_model_name_or_path: str = None, retriever: RagRetriever = None, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiates an question encoder and a generator from one or two base classes of the library from pretrained @@ -588,7 +588,6 @@ def forward( ) # encoder_outputs are pre-computed during RAG-token generation if encoder_outputs is None: - if has_to_retrieve: question_enc_outputs = self.question_encoder( input_ids, attention_mask=attention_mask, return_dict=True @@ -603,7 +602,6 @@ def forward( return_tensors="pt", ) if self.context_encoder_training: - ( context_input_ids, context_attention_mask, @@ -789,7 +787,7 @@ def forward( reduce_loss: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, n_docs: Optional[int] = None, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> RetrievAugLMMarginOutput: r""" exclude_bos_score (`bool`, *optional*): @@ -921,7 +919,7 @@ def generate( num_return_sequences: Optional[int] = None, # defaults to 1 num_beams: Optional[int] = None, # defaults to 1 n_docs: Optional[int] = None, - **model_kwargs + **model_kwargs, ) -> torch.LongTensor: """ Implements RAG sequence "thorough" decoding. 
Read the [`~generation.GenerationMixin.generate`]` documentation @@ -1176,7 +1174,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, doc_scores=None, n_docs=None, - **kwargs + **kwargs, ): if past_key_values is not None: # if past is defined use only last decoder_input_ids @@ -1225,7 +1223,6 @@ def _reorder_stacked(hidden_states, new_order): return reordered_past def marginalize(self, seq_logits, doc_scores, n_docs=None): - n_docs = n_docs if n_docs is not None else self.config.n_docs # RAG-token marginalization @@ -1257,7 +1254,7 @@ def forward( reduce_loss: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, n_docs: Optional[int] = None, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> RetrievAugLMMarginOutput: r""" do_marginalize (`bool`, *optional*): @@ -1390,7 +1387,7 @@ def generate( prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]] = None, logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(), stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(), - **kwargs + **kwargs, ) -> torch.LongTensor: """ Implements RAG token decoding. diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py index cda15a8b4454ae..0ea2e554489b6f 100644 --- a/src/transformers/models/rag/modeling_tf_rag.py +++ b/src/transformers/models/rag/modeling_tf_rag.py @@ -232,7 +232,7 @@ def from_pretrained_question_encoder_generator( generator_pretrained_model_name_or_path: str = None, retriever: RagRetriever = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiates an question encoder and a generator from one or two base classes of the library from pretrained @@ -491,7 +491,6 @@ def from_pretrained_question_encoder_generator( @add_start_docstrings_to_model_forward(RAG_START_DOCSTRING) class TFRagModel(TFRagPreTrainedModel): - load_weight_prefix = "tf_rag_model_1" def __init__( @@ -562,7 +561,7 @@ def call( n_docs: Optional[int] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ): r""" Returns: @@ -602,7 +601,6 @@ def call( # encoder_outputs are pre-computed during RAG-token generation if encoder_outputs is None: - if has_to_retrieve: question_enc_outputs = self.question_encoder( input_ids, attention_mask=attention_mask, return_dict=True, training=training @@ -726,7 +724,6 @@ def call( RAG_START_DOCSTRING, ) class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss): - load_weight_prefix = "tf_rag_token_for_generation_1/rag" def __init__( @@ -771,7 +768,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, doc_scores=None, n_docs=None, - **kwargs + **kwargs, ): if past_key_values is not None: # if past is defined use only last decoder_input_ids @@ -863,7 +860,7 @@ def call( reduce_loss: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ): r""" do_marginalize (`bool`, *optional*): @@ -1004,7 +1001,7 @@ def generate( n_docs=None, generation_config=None, logits_processor=TFLogitsProcessorList(), - **kwargs + **kwargs, ): """ Implements TFRAG token decoding. 
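(Aside, not part of the patch.) Nearly every signature hunk in this file changes only the last character of a multi-line parameter list: `**kwargs` becomes `**kwargs,`. That is consistent with a newer Black release keeping the magic trailing comma after the final parameter, including `**kwargs`, once a signature is already split across lines, and is likely a consequence of the Black version bump that accompanies this tooling change. A hedged sketch with made-up parameter names:

    def generate(
        input_ids=None,
        attention_mask=None,
        num_beams=None,
        **model_kwargs,  # the formatter now keeps a comma after the final parameter too
    ):
        # Dummy body; only the signature layout matters for this example.
        return input_ids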
@@ -1299,7 +1296,6 @@ def hf_compute_loss(self, labels, y_pred, smooth_epsilon=0.0, from_logits=True, RAG_START_DOCSTRING, ) class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss): - load_weight_prefix = "tf_rag_sequence_for_generation_1/rag" def __init__( @@ -1370,7 +1366,7 @@ def call( reduce_loss: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs # needs kwargs for generation + **kwargs, # needs kwargs for generation ) -> Union[Tuple[tf.Tensor], TFRetrievAugLMMarginOutput]: r""" exclude_bos_score (`bool`, *optional*): @@ -1592,7 +1588,7 @@ def generate( num_return_sequences=None, # defaults to 1 num_beams=None, # defaults to 1 n_docs=None, - **model_kwargs + **model_kwargs, ): """ Implements RAG sequence "thorough" decoding. Read the [`~generation.GenerationMixin.generate`]` documentation diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 5f15058e5f83bd..bef2baf05f202d 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -153,7 +153,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 1231cea1b9b66b..a7423714944c4c 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -577,7 +577,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1526,7 +1525,6 @@ def forward( @add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING) class RealmReader(RealmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] def __init__(self, config): diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index db6c8c7246be31..4bdf19454f0815 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -18,8 +18,8 @@ from typing import Optional, Union import numpy as np - from huggingface_hub import hf_hub_download + from transformers import AutoTokenizer from ...utils import logging diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index 28ee33ab87bc07..de067b0594cad8 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -155,7 +155,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -203,7 +203,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py index f61fa8418ed2ba..4db8b165b96300 100644 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -160,7 +160,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py index d4ffb2a39b055d..af712ced1eed0e 100755 --- a/src/transformers/models/reformer/configuration_reformer.py +++ b/src/transformers/models/reformer/configuration_reformer.py @@ -197,7 +197,7 @@ def __init__( tie_word_embeddings=False, use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): self.hash_seed = hash_seed self.vocab_size = vocab_size diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index af76a55b1feb01..a9ca155bb1b14e 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1226,7 +1226,6 @@ def forward( def _compute_attn_mask( self, query_indices, key_indices, attention_mask, query_key_dots_shape, do_standard_self_attention ): - # chunk attention mask and look before and after if attention_mask is not None: attention_mask = attention_mask.to(torch.uint8)[:, None, :] diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 814d5ed6cde11a..8796c8149c8ae6 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -102,7 +102,7 @@ def __init__( unk_token="", additional_special_tokens=[], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git 
a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py index e9c6a61993d09a..4fae5943d72108 100644 --- a/src/transformers/models/reformer/tokenization_reformer_fast.py +++ b/src/transformers/models/reformer/tokenization_reformer_fast.py @@ -98,7 +98,7 @@ def __init__( eos_token="", unk_token="", additional_special_tokens=[], - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/regnet/configuration_regnet.py b/src/transformers/models/regnet/configuration_regnet.py index 13b61c235be06c..201354d1553c34 100644 --- a/src/transformers/models/regnet/configuration_regnet.py +++ b/src/transformers/models/regnet/configuration_regnet.py @@ -78,7 +78,7 @@ def __init__( groups_width=64, layer_type="y", hidden_act="relu", - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: diff --git a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py index 4a73b9623f113c..22a8a99ca20b03 100644 --- a/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py @@ -29,14 +29,14 @@ import torch import torch.nn as nn -from torch import Tensor - from classy_vision.models.regnet import RegNet, RegNetParams from huggingface_hub import cached_download, hf_hub_url +from torch import Tensor +from vissl.models.model_helpers import get_trunk_forward_outputs + from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel from transformers.modeling_utils import PreTrainedModel from transformers.utils import logging -from vissl.models.model_helpers import get_trunk_forward_outputs logging.set_verbosity_info() diff --git a/src/transformers/models/regnet/convert_regnet_to_pytorch.py b/src/transformers/models/regnet/convert_regnet_to_pytorch.py index acb74dc89dce93..6b34c6aa19c104 100644 --- a/src/transformers/models/regnet/convert_regnet_to_pytorch.py +++ b/src/transformers/models/regnet/convert_regnet_to_pytorch.py @@ -22,16 +22,16 @@ from pathlib import Path from typing import Callable, Dict, List, Tuple +import timm import torch import torch.nn as nn -from torch import Tensor - -import timm from classy_vision.models.regnet import RegNet, RegNetParams, RegNetY32gf, RegNetY64gf, RegNetY128gf from huggingface_hub import cached_download, hf_hub_url +from torch import Tensor +from vissl.models.model_helpers import get_trunk_forward_outputs + from transformers import AutoFeatureExtractor, RegNetConfig, RegNetForImageClassification, RegNetModel from transformers.utils import logging -from vissl.models.model_helpers import get_trunk_forward_outputs logging.set_verbosity_info() diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py index 550722569178e9..792a6dbcfadfe7 100644 --- a/src/transformers/models/rembert/configuration_rembert.py +++ b/src/transformers/models/rembert/configuration_rembert.py @@ -119,7 +119,7 @@ def __init__( pad_token_id=0, bos_token_id=312, eos_token_id=313, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index c94c3a491eefec..9a65cb97ab56bf 100755 --- 
a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -521,7 +521,6 @@ def forward( output_hidden_states: bool = False, return_dict: bool = True, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -536,7 +535,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1015,7 +1013,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING ) class RemBertForCausalLM(RemBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/rembert/modeling_tf_rembert.py b/src/transformers/models/rembert/modeling_tf_rembert.py index 46bcfa3458a792..74683dfc0c0884 100644 --- a/src/transformers/models/rembert/modeling_tf_rembert.py +++ b/src/transformers/models/rembert/modeling_tf_rembert.py @@ -669,7 +669,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py index 4c2cce94aa3ad6..cff101451b4a97 100644 --- a/src/transformers/models/rembert/tokenization_rembert.py +++ b/src/transformers/models/rembert/tokenization_rembert.py @@ -109,7 +109,7 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py index 72c402438f50f2..5d5032f4112838 100644 --- a/src/transformers/models/rembert/tokenization_rembert_fast.py +++ b/src/transformers/models/rembert/tokenization_rembert_fast.py @@ -114,7 +114,7 @@ def __init__( pad_token="", cls_token="[CLS]", mask_token="[MASK]", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/resnet/configuration_resnet.py b/src/transformers/models/resnet/configuration_resnet.py index 74f6c693972298..bf1855340b074f 100644 --- a/src/transformers/models/resnet/configuration_resnet.py +++ b/src/transformers/models/resnet/configuration_resnet.py @@ -89,7 +89,7 @@ def __init__( hidden_act="relu", downsample_in_first_stage=False, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) if layer_type not in self.layer_types: @@ -114,7 +114,6 @@ def __init__( class ResNetOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/resnet/convert_resnet_to_pytorch.py b/src/transformers/models/resnet/convert_resnet_to_pytorch.py index ef3d564185df8c..5f836c9d2a05c7 100644 --- a/src/transformers/models/resnet/convert_resnet_to_pytorch.py +++ b/src/transformers/models/resnet/convert_resnet_to_pytorch.py @@ -22,12 +22,12 @@ from pathlib import Path from typing import List +import timm import torch import torch.nn as nn +from huggingface_hub import hf_hub_download from torch import Tensor -import timm -from huggingface_hub import hf_hub_download from transformers import AutoFeatureExtractor, ResNetConfig, ResNetForImageClassification from transformers.utils import logging diff --git a/src/transformers/models/resnet/modeling_tf_resnet.py b/src/transformers/models/resnet/modeling_tf_resnet.py index 9f02b46f8baffc..bb6035adf2df64 100644 --- a/src/transformers/models/resnet/modeling_tf_resnet.py +++ b/src/transformers/models/resnet/modeling_tf_resnet.py @@ -171,7 +171,7 @@ def __init__( stride: int = 1, activation: str = "relu", reduction: int = 4, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) should_apply_shortcut = in_channels != out_channels or stride != 1 diff --git a/src/transformers/models/retribert/configuration_retribert.py b/src/transformers/models/retribert/configuration_retribert.py index 23172cf40ec7d3..33663ad6167f9f 100644 --- a/src/transformers/models/retribert/configuration_retribert.py +++ b/src/transformers/models/retribert/configuration_retribert.py @@ -91,7 +91,7 @@ def __init__( share_encoders=True, projection_dim=128, pad_token_id=0, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/retribert/tokenization_retribert.py index a15cbcdf74bfda..0c04c363ebe0a0 100644 --- a/src/transformers/models/retribert/tokenization_retribert.py +++ b/src/transformers/models/retribert/tokenization_retribert.py @@ -68,7 +68,6 @@ def whitespace_tokenize(text): class RetriBertTokenizer(PreTrainedTokenizer): - r""" Constructs a RetriBERT tokenizer. 
@@ -131,7 +130,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -183,7 +182,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/retribert/tokenization_retribert_fast.py b/src/transformers/models/retribert/tokenization_retribert_fast.py index 2532f839a30090..c242213e1faedb 100644 --- a/src/transformers/models/retribert/tokenization_retribert_fast.py +++ b/src/transformers/models/retribert/tokenization_retribert_fast.py @@ -114,7 +114,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 3d911f5847dc76..3025fe2833d904 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -123,7 +123,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/roberta/modeling_flax_roberta.py b/src/transformers/models/roberta/modeling_flax_roberta.py index bfb6ea365adcf6..5de7375d383843 100644 --- a/src/transformers/models/roberta/modeling_flax_roberta.py +++ b/src/transformers/models/roberta/modeling_flax_roberta.py @@ -14,11 +14,10 @@ # limitations under the License. from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -739,7 +738,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 884773e641de1a..2b6d47b42036c4 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -501,7 +501,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
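(Aside, not part of the patch.) The import hunks in the conversion scripts and Flax modules above all follow one pattern: third-party imports (`numpy`, `torch`, `jax`, `flax`, `requests`, `huggingface_hub`, ...) are merged into a single alphabetically sorted block, and `transformers` imports move into their own block after it. That matches ruff's isort-compatible rules with `transformers` treated as a first-party package, which is an assumption about the repository's ruff settings. A small illustrative module:

    # Third-party imports: one block, sorted case-insensitively.
    import jax
    import jax.numpy as jnp
    import numpy as np
    from flax.core.frozen_dict import FrozenDict

    # First-party imports: a separate block placed after the third-party one.
    from transformers import PretrainedConfig

    # Trivial usages so the example stands on its own.
    params = FrozenDict({"scale": np.ones(3)})
    scaled = jnp.asarray(params["scale"]) * jax.device_count()
    config = PretrainedConfig()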
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index c38de45e5629e9..606afb754b4ffe 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -633,7 +633,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py index 10b28125e92bce..d291a2f9d97a08 100644 --- a/src/transformers/models/roberta/tokenization_roberta.py +++ b/src/transformers/models/roberta/tokenization_roberta.py @@ -198,7 +198,7 @@ def __init__( pad_token="", mask_token="", add_prefix_space=False, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index 29381404c47fe9..49311b3aeff9ab 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -173,7 +173,7 @@ def __init__( mask_token="", add_prefix_space=False, trim_offsets=True, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index 1683e527aa8776..49f92586c1b732 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -124,7 +124,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py index 463f8d58a61eaf..41fd14c5fddff2 100644 --- a/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/roberta_prelayernorm/convert_roberta_prelayernorm_original_pytorch_checkpoint_to_pytorch.py @@ -18,8 +18,8 @@ import argparse import torch - from huggingface_hub import hf_hub_download + from transformers import AutoTokenizer, RobertaPreLayerNormConfig, RobertaPreLayerNormForMaskedLM from transformers.utils import logging diff --git a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py index dc9ef03afdc7e8..8f5dc7944c1614 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py @@ -15,11 +15,10 @@ """ Flax RoBERTa-PreLayerNorm model.""" from typing import Callable, Optional, Tuple -import numpy as np - import 
flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -742,7 +741,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 037f40fb6d7d78..e5fbb6e3413c1b 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -503,7 +503,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py index d5808a82542b19..1843605bd04a26 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py @@ -634,7 +634,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 63f3dbc49eacf0..2f0a0dd0e0f7ac 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -136,7 +136,7 @@ def __init__( shape_embed_dim=512, shape_vocab_size=24858, concat_input=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 11b72170c44981..ac3d374a556c7a 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -635,7 +635,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1551,7 +1550,7 @@ def prepare_inputs_for_generation( input_pronunciation_ids=None, past_key_values=None, attention_mask=None, - **model_kwargs + **model_kwargs, ): input_shape = input_ids.shape diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py index 07e740577a06c3..4338c098ba79f6 100644 --- a/src/transformers/models/roc_bert/tokenization_roc_bert.py +++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py @@ -154,7 +154,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -214,7 +214,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -243,7 +242,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -342,7 +341,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It @@ -584,7 +583,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py index cbd92b412ba385..8a9e2dba958398 100644 --- a/src/transformers/models/roformer/configuration_roformer.py +++ b/src/transformers/models/roformer/configuration_roformer.py @@ -126,7 +126,7 @@ def __init__( pad_token_id=0, rotary_value=False, use_cache=True, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/roformer/modeling_flax_roformer.py b/src/transformers/models/roformer/modeling_flax_roformer.py index d18640d29b5760..d95a4d73832e9a 100644 --- a/src/transformers/models/roformer/modeling_flax_roformer.py +++ b/src/transformers/models/roformer/modeling_flax_roformer.py @@ -16,11 +16,10 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -622,7 +621,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 33a85deac9ba76..3631b9704f6ec6 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -572,7 +572,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if 
self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/roformer/modeling_tf_roformer.py b/src/transformers/models/roformer/modeling_tf_roformer.py index 018595d64baa27..952250e68a04fb 100644 --- a/src/transformers/models/roformer/modeling_tf_roformer.py +++ b/src/transformers/models/roformer/modeling_tf_roformer.py @@ -623,7 +623,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py index 5ab1f694ad9a0f..6c0b6cd4f3c281 100644 --- a/src/transformers/models/roformer/tokenization_roformer.py +++ b/src/transformers/models/roformer/tokenization_roformer.py @@ -364,7 +364,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py index 7b2cab56886200..88ccf183d17462 100644 --- a/src/transformers/models/roformer/tokenization_roformer_fast.py +++ b/src/transformers/models/roformer/tokenization_roformer_fast.py @@ -106,7 +106,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py index 994ecba2097d15..44a835c4b06cab 100644 --- a/src/transformers/models/segformer/configuration_segformer.py +++ b/src/transformers/models/segformer/configuration_segformer.py @@ -122,7 +122,7 @@ def __init__( layer_norm_eps=1e-6, decoder_hidden_size=256, semantic_loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -155,7 +155,6 @@ def __init__( class SegformerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py index 00dddc9974a953..024a8699b01e8b 100644 --- a/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py +++ b/src/transformers/models/segformer/convert_segformer_original_to_pytorch.py @@ -20,11 +20,11 @@ from collections import OrderedDict from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( SegformerConfig, SegformerFeatureExtractor, diff --git a/src/transformers/models/segformer/image_processing_segformer.py b/src/transformers/models/segformer/image_processing_segformer.py index f52e7d0d00b4db..1920c6668b0c19 100644 --- a/src/transformers/models/segformer/image_processing_segformer.py +++ b/src/transformers/models/segformer/image_processing_segformer.py @@ -96,7 +96,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_reduce_labels: bool = False, - 
**kwargs + **kwargs, ) -> None: if "reduce_labels" in kwargs: warnings.warn( @@ -146,7 +146,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -173,7 +173,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -194,7 +194,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 4ac9a87406497e..c877e86acfab4c 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -107,7 +107,7 @@ def __init__( hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.hidden_size = hidden_size @@ -213,7 +213,7 @@ def __init__( hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.self = TFSegformerEfficientSelfAttention( @@ -262,7 +262,7 @@ def __init__( in_features: int, hidden_features: int = None, out_features: int = None, - **kwargs + **kwargs, ): super().__init__(**kwargs) out_features = out_features or in_features @@ -296,7 +296,7 @@ def __init__( drop_path: float, sequence_reduction_ratio: int, mlp_ratio: int, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 1c69818fa156b5..af7041843de3a9 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -190,7 +190,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py index 58c0338a850d0f..81c3284af8ef6e 100644 --- a/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py @@ -25,6 +25,7 @@ # Register SEW's fairseq modules from sew_asapp import tasks # noqa: F401 + from transformers import ( SEWConfig, SEWForCTC, diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 3f308891ec2b49..12ab9232ac8d82 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -1053,7 +1053,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git 
a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index 845d3698e20e96..adf2ff04b8d6d8 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -214,7 +214,7 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py index 942add470b9c68..7844d7912f2c8b 100644 --- a/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/sew_d/convert_sew_d_original_pytorch_checkpoint_to_pytorch.py @@ -25,6 +25,7 @@ # Register SEW's fairseq modules from sew_asapp import tasks # noqa: F401 + from transformers import ( SEWDConfig, SEWDForCTC, diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index b25d7be0c7ebed..9ddfc7821f4088 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1132,7 +1132,6 @@ def forward( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) @@ -1594,7 +1593,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py index 8ecf0967635708..d7e7bdf57fc913 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py @@ -345,9 +345,8 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): - if not _do_init: raise ValueError( "`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`." 
@@ -616,7 +615,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -750,7 +748,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -787,7 +785,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index dab31b947051be..0bc4134d4b554a 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -290,7 +290,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py index 7c22e20e4d49b0..8bad1972e09215 100644 --- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py +++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py @@ -144,7 +144,7 @@ def __init__( conv_channels=1024, input_feat_per_channel=80, input_channels=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py index 1492ce942f86bb..a5e6b0d4004264 100644 --- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py +++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py @@ -68,7 +68,7 @@ def __init__( do_ceptral_normalize=True, normalize_means=True, normalize_vars=True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.num_mel_bins = num_mel_bins @@ -133,7 +133,7 @@ def __call__( return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, return_attention_mask: Optional[bool] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index e1c8d467e3160f..e68009a3ae15a6 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1048,7 +1048,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -1074,7 +1073,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1402,7 +1400,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 1994417d7b71a5..85333f4e0dd896 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -1168,7 +1168,7 @@ def call( output_hidden_states=None, return_dict=None, training=False, - **kwargs + **kwargs, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1279,7 +1279,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: bool = False, - **kwargs + **kwargs, ) -> Union[Tuple, TFSeq2SeqModelOutput]: outputs = self.model( input_features=input_features, @@ -1370,7 +1370,7 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, training: Optional[bool] = False, - **kwargs + **kwargs, ) -> Union[Tuple, TFSeq2SeqLMOutput]: r""" labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1484,7 +1484,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py index fddbc8c7afd47f..596f6bea0bbce9 100644 --- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py @@ -110,7 +110,7 @@ def __init__( bos_token_id=0, eos_token_id=2, max_target_positions=1024, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index f0452e4df1a619..8d5c508b9808c3 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -657,7 +657,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. 
Setting `use_cache =" @@ -683,7 +682,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py index 3365dfe382ae6f..4c90ba05ba31c9 100644 --- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py @@ -108,7 +108,7 @@ def __init__( unk_token="<unk>", do_lower_case=False, merges_file=None, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, diff --git a/src/transformers/models/speecht5/configuration_speecht5.py b/src/transformers/models/speecht5/configuration_speecht5.py index c36d6e22faf938..9385e44dc664ea 100644 --- a/src/transformers/models/speecht5/configuration_speecht5.py +++ b/src/transformers/models/speecht5/configuration_speecht5.py @@ -246,7 +246,7 @@ def __init__( decoder_max_relative_position=160, use_cache=True, is_encoder_decoder=True, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/speecht5/feature_extraction_speecht5.py b/src/transformers/models/speecht5/feature_extraction_speecht5.py index 4f08b1cf577940..7b4f2af17367ef 100644 --- a/src/transformers/models/speecht5/feature_extraction_speecht5.py +++ b/src/transformers/models/speecht5/feature_extraction_speecht5.py @@ -91,7 +91,7 @@ def __init__( mel_floor: float = 1e-10, reduction_factor: int = 2, return_attention_mask: bool = True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.do_normalize = do_normalize @@ -242,7 +242,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
@@ -358,7 +358,7 @@ def _process_audio( pad_to_multiple_of: Optional[int] = None, return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchFeature: is_batched = bool( isinstance(speech, (list, tuple)) diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index f4d51d665f4e9a..d9c381fdc86d14 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -1440,7 +1440,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - hidden_states, attention_mask = self.prenet(input_values, attention_mask) outputs = self.wrapped_encoder( @@ -1484,7 +1483,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - hidden_states = self.prenet(input_values) outputs = self.wrapped_encoder( @@ -1522,7 +1520,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: - return self.wrapped_encoder( hidden_states=input_values, attention_mask=attention_mask, @@ -1792,7 +1789,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - decoder_hidden_states = self.prenet(input_values, speaker_embeddings) outputs = self.wrapped_decoder( @@ -1846,7 +1842,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values) outputs = self.wrapped_decoder( @@ -1894,7 +1889,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: - outputs = self.wrapped_decoder( hidden_states=input_values, attention_mask=attention_mask, @@ -2381,7 +2375,7 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past is not None: diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py index 4b641558b7c0f1..a0b933f305610f 100644 --- a/src/transformers/models/speecht5/tokenization_speecht5.py +++ b/src/transformers/models/speecht5/tokenization_speecht5.py @@ -98,7 +98,7 @@ def __init__( unk_token="<unk>", pad_token="<pad>", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/splinter/configuration_splinter.py b/src/transformers/models/splinter/configuration_splinter.py index 60b2580bec1aff..bdbe5f013143a6 100644 --- a/src/transformers/models/splinter/configuration_splinter.py +++ b/src/transformers/models/splinter/configuration_splinter.py @@ -107,7 +107,7 @@ def __init__( use_cache=True, pad_token_id=0, question_token_id=104, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 954e0aa356d524..c44a9886640899 100755 --- 
a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -450,7 +450,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py index 40daeb09465ad1..308680940db106 100644 --- a/src/transformers/models/splinter/tokenization_splinter.py +++ b/src/transformers/models/splinter/tokenization_splinter.py @@ -135,7 +135,7 @@ def __init__( question_token="[QUESTION]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -192,7 +192,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/splinter/tokenization_splinter_fast.py b/src/transformers/models/splinter/tokenization_splinter_fast.py index 6eb69755905ace..97db72caadc05c 100644 --- a/src/transformers/models/splinter/tokenization_splinter_fast.py +++ b/src/transformers/models/splinter/tokenization_splinter_fast.py @@ -113,7 +113,7 @@ def __init__( question_token="[QUESTION]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/squeezebert/configuration_squeezebert.py b/src/transformers/models/squeezebert/configuration_squeezebert.py index 639be83a6c5ab3..5757b9410fce40 100644 --- a/src/transformers/models/squeezebert/configuration_squeezebert.py +++ b/src/transformers/models/squeezebert/configuration_squeezebert.py @@ -134,7 +134,7 @@ def __init__( post_attention_groups=1, intermediate_groups=4, output_groups=4, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index a74ad5897b2adb..4f666bb7f8f098 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -315,7 +315,6 @@ def forward( output_hidden_states=False, return_dict=True, ): - if head_mask is None: head_mask_is_all_none = True elif head_mask.count(None) == len(head_mask): @@ -331,7 +330,6 @@ def forward( all_attentions = () if output_attentions else None for layer in self.layers: - if output_hidden_states: hidden_states = hidden_states.permute(0, 2, 1) all_hidden_states += (hidden_states,) @@ -645,7 +643,6 @@ def forward( @add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING) class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ r"predictions.decoder.bias", "cls.predictions.decoder.weight", diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py index 00d450058238fd..ed7be941e0363a 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py @@ -136,7 
+136,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( do_lower_case=do_lower_case, @@ -184,7 +184,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py index 0423c16fc33125..bf7659ffd18b4b 100644 --- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py +++ b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py @@ -124,7 +124,7 @@ def __init__( mask_token="[MASK]", tokenize_chinese_chars=True, strip_accents=None, - **kwargs + **kwargs, ): super().__init__( vocab_file, diff --git a/src/transformers/models/swin/configuration_swin.py b/src/transformers/models/swin/configuration_swin.py index 89c9d556b287d3..3eb21d5663bc24 100644 --- a/src/transformers/models/swin/configuration_swin.py +++ b/src/transformers/models/swin/configuration_swin.py @@ -129,7 +129,7 @@ def __init__( layer_norm_eps=1e-5, encoder_stride=32, out_features=None, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -168,7 +168,6 @@ def __init__( class SwinOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py index 302bd6f3f7d1ad..72e6bce5b09e5b 100644 --- a/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_simmim_to_pytorch.py @@ -18,10 +18,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import SwinConfig, SwinForMaskedImageModeling, ViTFeatureExtractor diff --git a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py index 860fdd1b54d2af..c5734757e60188 100644 --- a/src/transformers/models/swin/convert_swin_timm_to_pytorch.py +++ b/src/transformers/models/swin/convert_swin_timm_to_pytorch.py @@ -1,12 +1,12 @@ import argparse import json -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import AutoFeatureExtractor, SwinConfig, SwinForImageClassification diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 7b48cdc84d2ff2..abf47cf83135a8 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -741,7 +741,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/swin/modeling_tf_swin.py b/src/transformers/models/swin/modeling_tf_swin.py index 46155a7d73ad20..1022990280a98f 100644 --- a/src/transformers/models/swin/modeling_tf_swin.py +++ b/src/transformers/models/swin/modeling_tf_swin.py @@ -788,7 +788,7 @@ def __init__( num_heads: int, drop_path: List[float], downsample: Optional[Callable], - **kwargs + **kwargs, ) -> None: 
super().__init__(**kwargs) self.config = config diff --git a/src/transformers/models/swin2sr/configuration_swin2sr.py b/src/transformers/models/swin2sr/configuration_swin2sr.py index 4547b5848a1ba2..79b1dda68eca73 100644 --- a/src/transformers/models/swin2sr/configuration_swin2sr.py +++ b/src/transformers/models/swin2sr/configuration_swin2sr.py @@ -128,7 +128,7 @@ def __init__( img_range=1.0, resi_connection="1conv", upsampler="pixelshuffle", - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py index 38a11496f7ee46..6884bf0afc0cde 100644 --- a/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py +++ b/src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py @@ -16,11 +16,11 @@ import argparse +import requests import torch from PIL import Image from torchvision.transforms import Compose, Normalize, Resize, ToTensor -import requests from transformers import Swin2SRConfig, Swin2SRForImageSuperResolution, Swin2SRImageProcessor diff --git a/src/transformers/models/swin2sr/image_processing_swin2sr.py b/src/transformers/models/swin2sr/image_processing_swin2sr.py index 62ec9db16c489d..24ee846e45976b 100644 --- a/src/transformers/models/swin2sr/image_processing_swin2sr.py +++ b/src/transformers/models/swin2sr/image_processing_swin2sr.py @@ -50,7 +50,7 @@ def __init__( rescale_factor: Union[int, float] = 1 / 255, do_pad: bool = True, pad_size: int = 8, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) diff --git a/src/transformers/models/swinv2/configuration_swinv2.py b/src/transformers/models/swinv2/configuration_swinv2.py index 46d943cbe924e1..4859ccd51eb5b4 100644 --- a/src/transformers/models/swinv2/configuration_swinv2.py +++ b/src/transformers/models/swinv2/configuration_swinv2.py @@ -118,7 +118,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-5, encoder_stride=32, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py index 7af3bfb86c1741..ba70e707a949d8 100644 --- a/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py +++ b/src/transformers/models/swinv2/convert_swinv2_timm_to_pytorch.py @@ -18,12 +18,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import AutoFeatureExtractor, Swinv2Config, Swinv2ForImageClassification diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index e46decde4ca038..3104e5d2d20ef7 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -815,7 +815,6 @@ def forward( ) -> Tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None layer_outputs = layer_module( diff --git a/src/transformers/models/switch_transformers/configuration_switch_transformers.py b/src/transformers/models/switch_transformers/configuration_switch_transformers.py index 0d84d7ee33ffaa..dd6c6c03e2cfae 100644 --- a/src/transformers/models/switch_transformers/configuration_switch_transformers.py +++ 
b/src/transformers/models/switch_transformers/configuration_switch_transformers.py @@ -129,7 +129,7 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/switch_transformers/convert_big_switch.py b/src/transformers/models/switch_transformers/convert_big_switch.py index aa44f9a2190d9d..86c673b48a4ede 100644 --- a/src/transformers/models/switch_transformers/convert_big_switch.py +++ b/src/transformers/models/switch_transformers/convert_big_switch.py @@ -2,12 +2,12 @@ import json import os -import torch -from tensorflow.io import gfile - import tensorstore as ts +import torch from flax import serialization from flax.traverse_util import flatten_dict, unflatten_dict +from tensorflow.io import gfile + from transformers.modeling_utils import dtype_byte_size from transformers.models.switch_transformers.convert_switch_transformers_original_flax_checkpoint_to_pytorch import ( rename_keys, diff --git a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py index 45cd63e4743336..5937101169c6b4 100644 --- a/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py +++ b/src/transformers/models/switch_transformers/convert_switch_transformers_original_flax_checkpoint_to_pytorch.py @@ -20,6 +20,7 @@ from flax.traverse_util import flatten_dict, unflatten_dict from t5x import checkpoints + from transformers import SwitchTransformersConfig, SwitchTransformersForConditionalGeneration from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.utils import logging @@ -92,7 +93,6 @@ def rename_keys(s_dict): # 3. Take extra care of the EXPERTS layer for key in list(s_dict.keys()): if "expert" in key: - num_experts = s_dict[key].shape[0] expert_weihts = s_dict[key] for idx in range(num_experts): diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index b19016bfab8bdd..77fe3be9c350e5 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -241,7 +241,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # SwitchTransformers uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -334,7 +333,6 @@ def forward(self, hidden_states): next_states = hidden_states.clone() for idx, expert in enumerate(self.experts.values()): - token_indices = router_mask[:, :, idx].bool() next_states[token_indices] = expert(hidden_states[token_indices]) @@ -721,7 +719,6 @@ def forward( output_router_logits=True, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. 
Please make sure this is intended.") @@ -939,7 +936,6 @@ def __init__(self, config, embed_tokens=None): config.num_layers = config.num_decoder_layers if self.is_decoder else config.num_layers self.block = nn.ModuleList() for i in range(config.num_layers): - is_sparse = (i % sparse_step == 1) if sparse_step > 0 else False self.block.append( @@ -1758,9 +1754,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py index a2bd03dfd74cc8..1ad7a51412c6da 100644 --- a/src/transformers/models/t5/configuration_t5.py +++ b/src/transformers/models/t5/configuration_t5.py @@ -98,7 +98,7 @@ def __init__( use_cache=True, pad_token_id=0, eos_token_id=1, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py index 68e66c8298238f..11f32c8461e97c 100644 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_flax.py @@ -18,6 +18,7 @@ import argparse from t5x import checkpoints + from transformers import FlaxT5ForConditionalGeneration, T5Config diff --git a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py index f58cee4ae8269f..aebe5d88fca365 100755 --- a/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py +++ b/src/transformers/models/t5/convert_t5x_checkpoint_to_pytorch.py @@ -32,9 +32,9 @@ import collections import torch - from flax import traverse_util from t5x import checkpoints + from transformers import T5Config, T5EncoderModel, T5ForConditionalGeneration from transformers.utils import logging diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py index 1006458fbd5a92..249d4913e010ef 100644 --- a/src/transformers/models/t5/modeling_flax_t5.py +++ b/src/transformers/models/t5/modeling_flax_t5.py @@ -18,11 +18,10 @@ import copy from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -943,7 +942,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -1439,7 +1438,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - # Encode if needed (training, first prediction pass) encoder_outputs = self.encoder( input_ids=input_ids, @@ -1745,7 +1743,7 @@ def prepare_inputs_for_generation( attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape diff --git a/src/transformers/models/t5/modeling_t5.py 
b/src/transformers/models/t5/modeling_t5.py index f329f32d42d249..49adf4c421af34 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -245,7 +245,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for @@ -666,7 +665,6 @@ def forward( output_attentions=False, return_dict=True, ): - if past_key_value is not None: if not self.is_decoder: logger.warning("`past_key_values` is passed to the encoder. Please make sure this is intended.") @@ -1722,9 +1720,8 @@ def prepare_inputs_for_generation( cross_attn_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py index 0e420a85d4deca..f9996e15314e2b 100644 --- a/src/transformers/models/t5/modeling_tf_t5.py +++ b/src/transformers/models/t5/modeling_tf_t5.py @@ -547,7 +547,6 @@ def call( output_attentions=False, training=False, ): - if past_key_value is not None: assert self.is_decoder, "Only decoder can use `past_key_values`" expected_num_past_key_values = 2 if encoder_hidden_states is None else 4 @@ -663,7 +662,6 @@ def call( return_dict=None, training=False, ) -> Tuple: - if input_ids is not None and inputs_embeds is not None: err_msg_prefix = "decoder_" if self.is_decoder else "" raise ValueError( @@ -1505,9 +1503,8 @@ def prepare_inputs_for_generation( decoder_head_mask=None, use_cache=None, encoder_outputs=None, - **kwargs + **kwargs, ): - # cut decoder_input_ids if past is used if past_key_values is not None: input_ids = input_ids[:, -1:] diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 4bdd3a907710c1..400c956a3d5771 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -120,7 +120,7 @@ def __init__( extra_ids=100, additional_special_tokens=None, sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py index 215df486786de0..589a346ed019bd 100644 --- a/src/transformers/models/t5/tokenization_t5_fast.py +++ b/src/transformers/models/t5/tokenization_t5_fast.py @@ -115,7 +115,7 @@ def __init__( pad_token="<pad>", extra_ids=100, additional_special_tokens=None, - **kwargs + **kwargs, ): # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: diff --git a/src/transformers/models/table_transformer/configuration_table_transformer.py b/src/transformers/models/table_transformer/configuration_table_transformer.py index bc7bd6034e4efe..d74424ce69fe3a 100644 --- a/src/transformers/models/table_transformer/configuration_table_transformer.py +++ b/src/transformers/models/table_transformer/configuration_table_transformer.py @@ -176,7 +176,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): if 
backbone_config is not None and use_timm_backbone: raise ValueError("You can't specify both `backbone_config` and `use_timm_backbone`.") @@ -238,7 +238,6 @@ def hidden_size(self) -> int: # Copied from transformers.models.detr.configuration_detr.DetrOnnxConfig class TableTransformerOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py index a53bd9e03d80d7..1973fe82e9d550 100644 --- a/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/table_transformer/convert_table_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -23,10 +23,10 @@ from pathlib import Path import torch +from huggingface_hub import hf_hub_download from PIL import Image from torchvision.transforms import functional as F -from huggingface_hub import hf_hub_download from transformers import DetrFeatureExtractor, TableTransformerConfig, TableTransformerForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py index 71fd5715ef57fb..f466ab42545f04 100644 --- a/src/transformers/models/tapas/configuration_tapas.py +++ b/src/transformers/models/tapas/configuration_tapas.py @@ -193,9 +193,8 @@ def __init__( disable_per_token_loss=False, aggregation_labels=None, no_aggregation_label_index=None, - **kwargs + **kwargs, ): - super().__init__(pad_token_id=pad_token_id, **kwargs) # BERT hyperparameters (with updated max_position_embeddings and type_vocab_sizes) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 55243f01fcdaba..83a3f9fefc9860 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -289,7 +289,6 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs position_ids = position_ids.unsqueeze(0).expand(input_shape) # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings if self.config.reset_position_index_per_cell: - # shape (batch_size, seq_len) col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1) # shape (batch_size, seq_len) @@ -1023,7 +1022,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - **kwargs + **kwargs, ) -> Union[Tuple, MaskedLMOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py index 1d25e04dd80df8..5c995aa93014a2 100644 --- a/src/transformers/models/tapas/modeling_tf_tapas.py +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -216,7 +216,6 @@ def call( position_ids = tf.broadcast_to(position_ids, shape=input_shape) # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings if self.reset_position_index_per_cell: - # shape (batch_size, seq_len) col_index = IndexMap(token_type_ids[:, :, 1], self.config.type_vocab_sizes[1], batch_dims=1) # shape (batch_size, seq_len) @@ -779,7 +778,6 @@ def call( return_dict: 
Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index d3c3d934dbd9e4..395ec876c9f410 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -340,7 +340,7 @@ def __init__( max_question_length=None, model_max_length: int = 512, additional_special_tokens: Optional[List[str]] = None, - **kwargs + **kwargs, ): if not is_pandas_available(): raise ImportError("Pandas is required for the TAPAS tokenizer.") @@ -418,7 +418,6 @@ def _tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - # If the token is part of the never_split set if token in self.basic_tokenizer.never_split: split_tokens.append(token) @@ -602,7 +601,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) related to a table. @@ -716,7 +715,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a table and a list of strings for the model. @@ -823,7 +822,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: table_tokens = self._tokenize_table(table) @@ -882,7 +881,7 @@ def _batch_prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: batch_outputs = {} @@ -944,7 +943,7 @@ def encode( truncation: Union[bool, str, TapasTruncationStrategy] = False, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare a table and a string for the model. This method does not return token type IDs, attention masks, etc. @@ -996,7 +995,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a table and a string for the model. @@ -1077,7 +1076,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ): if query is None: query = "" @@ -1136,7 +1135,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id so that it can be used by the model. 
It adds special tokens, truncates diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py index c91c9b16ae7abc..c41c6cbe47ae94 100644 --- a/src/transformers/models/tapex/tokenization_tapex.py +++ b/src/transformers/models/tapex/tokenization_tapex.py @@ -284,7 +284,7 @@ def __init__( mask_token="<mask>", add_prefix_space=False, max_cell_length=15, - **kwargs + **kwargs, ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token @@ -531,7 +531,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several table-sequence pair(s). @@ -608,7 +608,7 @@ def source_call_func( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Input type checking for clearer error valid_table = False @@ -695,7 +695,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ @@ -753,9 +753,8 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - if return_offsets_mapping: raise NotImplementedError( "return_offset_mapping is not available when using Python tokenizers. " @@ -875,7 +874,7 @@ def encode( truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare a table, a string and possible answer for the model. This method does not return token type IDs, @@ -914,7 +913,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( @@ -964,7 +963,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( @@ -1020,7 +1019,7 @@ def target_call_func( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ The method tokenizes and prepares the answer label for the model. @@ -1084,7 +1083,7 @@ def target_batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare answer strings for the model. 
@@ -1138,12 +1137,10 @@ def _target_batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - batch_outputs = {} for text in answer: - if self.do_lower_case: text = text.lower() @@ -1191,7 +1188,7 @@ def target_encode( truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None, max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc. @@ -1229,7 +1226,7 @@ def target_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepare a answer string for the model. @@ -1282,7 +1279,7 @@ def _target_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: if return_offsets_mapping: raise NotImplementedError( diff --git a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py index 8d89d5cd7f19f1..7066d1a2d84c9e 100644 --- a/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/configuration_time_series_transformer.py @@ -161,7 +161,7 @@ def __init__( num_parallel_samples: int = 100, init_std: float = 0.02, use_cache=True, - **kwargs + **kwargs, ): # time series specific configuration self.prediction_length = prediction_length diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e9f412c1bfec78..acc64332a76893 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -1347,7 +1347,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
@@ -1372,7 +1371,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/timesformer/configuration_timesformer.py b/src/transformers/models/timesformer/configuration_timesformer.py index d3db62e53d25f5..2e30a8c5dff268 100644 --- a/src/transformers/models/timesformer/configuration_timesformer.py +++ b/src/transformers/models/timesformer/configuration_timesformer.py @@ -104,7 +104,7 @@ def __init__( qkv_bias=True, attention_type="divided_space_time", drop_path_rate=0, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py index ca58994db82836..5a66776d4381a3 100644 --- a/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py +++ b/src/transformers/models/timesformer/convert_timesformer_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import TimesformerConfig, TimesformerForVideoClassification, VideoMAEFeatureExtractor diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 611d854262fd93..b5a525127b3def 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -235,7 +235,6 @@ def __init__(self, config: TimesformerConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -274,7 +273,6 @@ def __init__(self, config: TimesformerConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.dropout(hidden_states) diff --git a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py index 4042d79ea5f6f8..875980fde19e7b 100644 --- a/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/configuration_trajectory_transformer.py @@ -133,7 +133,7 @@ def __init__( pad_token_id=1, bos_token_id=50256, eos_token_id=50256, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.action_weight = action_weight diff --git a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py index 14e6556e07b7a1..622552fa783608 100644 --- a/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py @@ -15,8 +15,8 @@ """ TrajectoryTransformer pytorch checkpoint conversion""" import torch - import trajectory.utils as utils + from transformers import TrajectoryTransformerModel diff --git 
a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py index cf41166b9390a9..fee99ce4e56350 100644 --- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -538,7 +538,6 @@ def forward( all_hidden_states = () if output_hidden_states else None for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)): - if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) diff --git a/src/transformers/models/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/transfo_xl/configuration_transfo_xl.py index c9b8464b1039de..8550e71802867a 100644 --- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py +++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py @@ -141,7 +141,7 @@ def __init__( init_std=0.02, layer_norm_epsilon=1e-5, eos_token_id=0, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.cutoffs = [] diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py index f2c4653b7298dd..93af2165111288 100644 --- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py @@ -128,7 +128,7 @@ def __init__( layer_norm_epsilon=1e-5, init_std=0.02, output_attentions=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -281,7 +281,7 @@ def __init__( layer_norm_epsilon=1e-5, init_std=0.02, output_attentions=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -551,7 +551,6 @@ def call( labels: Optional[Union[np.ndarray, tf.Tensor]] = None, training: bool = False, ): - # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] if input_ids is not None and inputs_embeds is not None: diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index fb0157dd6f1b09..094b2d33f6855c 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -381,7 +381,6 @@ def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon ) def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False): - attn_outputs = self.dec_attn( dec_inp, r, @@ -860,7 +859,6 @@ def _update_mems(self, hids, mems, mlen, qlen): end_idx = mlen + max(0, qlen) beg_idx = max(0, end_idx - self.mem_len) for i in range(len(hids)): - cat = torch.cat([mems[i], hids[i]], dim=0) new_mems.append(cat[beg_idx:end_idx].detach()) diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py index 5b284a219a4753..13977d43828051 100644 --- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py @@ -179,7 +179,7 @@ def __init__( eos_token="<eos>", additional_special_tokens=["<formula>"], language="en", - **kwargs + **kwargs, ): super().__init__( special=special, @@ -643,7 +643,6 @@ def __iter__(self): class LMMultiFileIterator(LMShuffledIterator): def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False): - 
self.paths = paths self.vocab = vocab diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py index 901370c31eeb32..b3f03373618484 100644 --- a/src/transformers/models/trocr/configuration_trocr.py +++ b/src/transformers/models/trocr/configuration_trocr.py @@ -121,7 +121,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.d_model = d_model diff --git a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py index 997fd747621094..26291296817bb2 100644 --- a/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py +++ b/src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py @@ -18,10 +18,10 @@ import argparse from pathlib import Path +import requests import torch from PIL import Image -import requests from transformers import ( RobertaTokenizer, TrOCRConfig, diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index df7f6c569f474e..98ab48a938c5ed 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -689,7 +689,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -715,7 +714,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index d041075c9b626e..f4e8df659e9a01 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -226,7 +226,7 @@ def __init__( bos_token_id=1, eos_token_id=2, replace_prob=0.5, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index af7dc87e673ed9..d2866652377196 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1423,7 +1423,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 3205bbc2cca805..222f982fe769bb 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -239,7 +239,7 @@ def __init__( bos_token_id=1, eos_token_id=2, num_clusters=504, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 7ce1b4465b0ddb..1b197108ef43de 100755 --- 
a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1430,7 +1430,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1607,8 +1606,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Audio frame classification does not support the use of UniSpeechSat adapters" - " (config.add_adapter=True)" + "Audio frame classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)" ) self.unispeech_sat = UniSpeechSatModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 449e69def3e35b..593e8953a27358 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -85,7 +85,7 @@ def __init__( auxiliary_num_convs=1, auxiliary_concat_input=False, loss_ignore_index=255, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py index 0b7b6e11b11d91..eeb3ab5fc99381 100644 --- a/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py +++ b/src/transformers/models/upernet/convert_convnext_upernet_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ConvNextConfig, SegformerImageProcessor, UperNetConfig, UperNetForSemanticSegmentation diff --git a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py index a44549a4703cbc..9580af7c46a50c 100644 --- a/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py +++ b/src/transformers/models/upernet/convert_swin_upernet_to_pytorch.py @@ -20,11 +20,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import SegformerImageProcessor, SwinConfig, UperNetConfig, UperNetForSemanticSegmentation diff --git a/src/transformers/models/van/configuration_van.py b/src/transformers/models/van/configuration_van.py index 47d5a9b6c11aa1..85a0dd20e47728 100644 --- a/src/transformers/models/van/configuration_van.py +++ b/src/transformers/models/van/configuration_van.py @@ -94,7 +94,7 @@ def __init__( layer_scale_init_value=1e-2, drop_path_rate=0.0, dropout_rate=0.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.image_size = image_size diff --git a/src/transformers/models/van/convert_van_to_pytorch.py b/src/transformers/models/van/convert_van_to_pytorch.py index ded3c3500dad24..a8086e6d1b511b 100644 --- a/src/transformers/models/van/convert_van_to_pytorch.py +++ b/src/transformers/models/van/convert_van_to_pytorch.py @@ -27,9 +27,9 @@ import torch import torch.nn as nn +from huggingface_hub import cached_download, hf_hub_download from torch import Tensor -from huggingface_hub import cached_download, hf_hub_download from transformers 
import AutoFeatureExtractor, VanConfig, VanForImageClassification from transformers.models.van.modeling_van import VanLayerScaling from transformers.utils import logging diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 932c4c1d98cabf..8120bb23fc2a6c 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -119,7 +119,7 @@ def __init__( decoder_num_hidden_layers=4, decoder_intermediate_size=1536, norm_pix_loss=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 2f4ce5d44704a7..68f6372ab53ed6 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import ( VideoMAEConfig, VideoMAEFeatureExtractor, diff --git a/src/transformers/models/videomae/image_processing_videomae.py b/src/transformers/models/videomae/image_processing_videomae.py index 451d2461c344c7..e3edfdf00d27a0 100644 --- a/src/transformers/models/videomae/image_processing_videomae.py +++ b/src/transformers/models/videomae/image_processing_videomae.py @@ -115,7 +115,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -140,7 +140,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -171,7 +171,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `size` along any @@ -195,7 +195,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -216,7 +216,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 8831f65a49b19c..ee166317909e9e 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -106,6 +106,7 @@ class VideoMAEForPreTrainingOutput(ModelOutput): # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 def get_sinusoid_encoding_table(n_position, d_hid): """Sinusoid position encoding table""" + # TODO: make it with torch instead of numpy def get_position_angle_vec(position): return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] @@ -236,7 +237,6 @@ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias) values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias) @@ -286,7 +286,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -344,7 +343,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vilt/configuration_vilt.py b/src/transformers/models/vilt/configuration_vilt.py index 7a5c856413a08e..3c5935f1f1b22a 100644 --- a/src/transformers/models/vilt/configuration_vilt.py +++ b/src/transformers/models/vilt/configuration_vilt.py @@ -120,7 +120,7 @@ def __init__( max_image_length=-1, tie_word_embeddings=False, num_images=-1, - **kwargs + **kwargs, ): super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py index 5e737f784c81b2..693ca34efcf6e3 100644 --- a/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py +++ b/src/transformers/models/vilt/convert_vilt_original_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ( BertTokenizer, ViltConfig, diff --git a/src/transformers/models/vilt/image_processing_vilt.py b/src/transformers/models/vilt/image_processing_vilt.py index 9e50c1f7d58f6b..783197c7e956b3 100644 --- a/src/transformers/models/vilt/image_processing_vilt.py +++ b/src/transformers/models/vilt/image_processing_vilt.py @@ -165,7 +165,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -204,7 +204,7 @@ def resize( size_divisor: int = 32, 
resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. @@ -238,7 +238,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -259,7 +259,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 0be05ea4aa1e93..61cc69b694bf09 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -399,7 +399,6 @@ def __init__(self, config: ViltConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -451,7 +450,6 @@ def __init__(self, config: ViltConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) @@ -1419,7 +1417,6 @@ def forward( VILT_START_DOCSTRING, ) class ViltForTokenClassification(ViltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/vilt/processing_vilt.py b/src/transformers/models/vilt/processing_vilt.py index 2578724066bec2..4fd8d9188faa49 100644 --- a/src/transformers/models/vilt/processing_vilt.py +++ b/src/transformers/models/vilt/processing_vilt.py @@ -77,7 +77,7 @@ def __call__( return_length: bool = False, verbose: bool = True, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> BatchEncoding: """ This method uses [`ViltImageProcessor.__call__`] method to prepare image(s) for the model, and diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py index 0bdc6366867c2c..8561875ed59957 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py @@ -284,7 +284,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): if not _do_init: raise ValueError( @@ -553,7 +553,6 @@ def decode( def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): - projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() @@ -691,7 +690,7 @@ def prepare_inputs_for_generation( max_length, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, - **kwargs + **kwargs, ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape @@ -727,7 +726,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = 
None, *model_args, - **kwargs + **kwargs, ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py index 88d9b83bea6bab..b3d3c2b5707e31 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py @@ -147,7 +147,6 @@ # Copied from transformers.models.encoder_decoder.modeling_tf_encoder_decoder.shift_tokens_right def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): - if pad_token_id is None: raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.") pad_token_id = tf.cast(pad_token_id, input_ids.dtype) @@ -368,7 +367,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> TFPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model @@ -603,7 +602,6 @@ def call( ) if encoder_outputs is None: - encoder_inputs = { "input_ids": pixel_values, "output_attentions": output_attentions, diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py index 91c08901ce0014..34ead714658c2f 100644 --- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py @@ -368,7 +368,7 @@ def from_encoder_decoder_pretrained( encoder_pretrained_model_name_or_path: str = None, decoder_pretrained_model_name_or_path: str = None, *model_args, - **kwargs + **kwargs, ) -> PreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py index 4b163965486308..12453fde98125b 100644 --- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py @@ -227,9 +227,8 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): - if not _do_init: raise ValueError( "`FlaxVisionTextDualEncoderModel` cannot be created without initializing, `_do_init` must be `True`." 
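The `# TODO: make it with torch instead of numpy` note added to `get_sinusoid_encoding_table` in `modeling_videomae.py` above hints at a torch-only rewrite of that helper. A minimal sketch of what such a version could look like (not part of this diff; the function name and the `(1, n_position, d_hid)` return shape are only assumed to mirror the NumPy implementation):

```python
import torch


def get_sinusoid_encoding_table(n_position: int, d_hid: int) -> torch.Tensor:
    """Sinusoid position encoding table, computed with torch instead of numpy (sketch)."""
    position = torch.arange(n_position, dtype=torch.float32).unsqueeze(1)  # (n_position, 1)
    dim = torch.arange(d_hid, dtype=torch.float32).unsqueeze(0)  # (1, d_hid)
    # same angle formula as the numpy version: pos / 10000 ** (2 * (j // 2) / d_hid)
    angles = position / torch.pow(10000, 2 * (dim // 2) / d_hid)
    table = torch.empty(n_position, d_hid)
    table[:, 0::2] = torch.sin(angles[:, 0::2])  # even dimensions get sine
    table[:, 1::2] = torch.cos(angles[:, 1::2])  # odd dimensions get cosine
    return table.unsqueeze(0)  # (1, n_position, d_hid)
```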
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py index d9c075f6d9f34a..3e39c6b005a979 100755 --- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py @@ -169,7 +169,6 @@ def __init__( vision_model: Optional[PreTrainedModel] = None, text_model: Optional[PreTrainedModel] = None, ): - if config is None and (vision_model is None or text_model is None): raise ValueError("Either a configuration or an vision and a text model has to be provided") diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py index f256a286a0bcd8..a7282ef2bb5387 100644 --- a/src/transformers/models/visual_bert/configuration_visual_bert.py +++ b/src/transformers/models/visual_bert/configuration_visual_bert.py @@ -130,7 +130,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index d9250d73b17086..38ec21aff57b7c 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -1587,7 +1587,6 @@ def forward( loss = None if labels is not None: - # scores = batch x selected position x visual_feature # scores = selected_positions.bmm(visual_features.transpose(1,2)) # label = batch x selected_postion x needed position diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py index dcb3ac795217ab..ed760189634f58 100644 --- a/src/transformers/models/vit/configuration_vit.py +++ b/src/transformers/models/vit/configuration_vit.py @@ -106,7 +106,7 @@ def __init__( num_channels=3, qkv_bias=True, encoder_stride=16, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -127,7 +127,6 @@ def __init__( class ViTOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/vit/convert_dino_to_pytorch.py b/src/transformers/models/vit/convert_dino_to_pytorch.py index 1a8ba21a658b8b..ceedc2603a74bf 100644 --- a/src/transformers/models/vit/convert_dino_to_pytorch.py +++ b/src/transformers/models/vit/convert_dino_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel from transformers.utils import logging diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py index bc1f7f72dd5f3c..718e8624cc8f4a 100644 --- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py +++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -19,12 +19,12 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image + from transformers import DeiTFeatureExtractor, 
ViTConfig, ViTFeatureExtractor, ViTForImageClassification, ViTModel from transformers.utils import logging diff --git a/src/transformers/models/vit/image_processing_vit.py b/src/transformers/models/vit/image_processing_vit.py index 4b089443b58f09..5ca0d932880875 100644 --- a/src/transformers/models/vit/image_processing_vit.py +++ b/src/transformers/models/vit/image_processing_vit.py @@ -81,7 +81,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"height": 224, "width": 224} @@ -101,7 +101,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image to `(size["height"], size["width"])`. @@ -157,7 +157,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. diff --git a/src/transformers/models/vit/modeling_flax_vit.py b/src/transformers/models/vit/modeling_flax_vit.py index ff9230777bbb6b..1ab2671efd75bf 100644 --- a/src/transformers/models/vit/modeling_flax_vit.py +++ b/src/transformers/models/vit/modeling_flax_vit.py @@ -85,7 +85,6 @@ class FlaxViTPatchEmbeddings(nn.Module): - config: ViTConfig dtype: jnp.dtype = jnp.float32 # the dtype of the computation @@ -356,7 +355,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -444,7 +442,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) if input_shape is None: @@ -523,7 +521,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - hidden_states = self.embeddings(pixel_values, deterministic=deterministic) outputs = self.encoder( diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py index acc4f68bf38efb..6d0c579a43db0a 100644 --- a/src/transformers/models/vit/modeling_tf_vit.py +++ b/src/transformers/models/vit/modeling_tf_vit.py @@ -65,7 +65,6 @@ def __init__(self, config: ViTConfig, **kwargs): self.config = config def build(self, input_shape: tf.TensorShape): - num_patches = self.patch_embeddings.num_patches self.cls_token = self.add_weight( shape=(1, 1, self.config.hidden_size), @@ -496,7 +495,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -685,7 +683,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: - outputs = self.vit( pixel_values=pixel_values, head_mask=head_mask, diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 79f61a392b7375..1fb4c4f1022eef 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -248,7 +248,6 @@ def __init__(self, config: ViTConfig) -> None: self.dropout = 
nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -304,7 +303,6 @@ def __init__(self, config: ViTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py index abc9920782e07f..34b778a1f94414 100644 --- a/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/configuration_vit_hybrid.py @@ -107,7 +107,7 @@ def __init__( num_channels=3, backbone_featmap_shape=[1, 1024, 24, 24], qkv_bias=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py index ad5847360f980e..e88ee246ba1c1d 100644 --- a/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py +++ b/src/transformers/models/vit_hybrid/convert_vit_hybrid_timm_to_pytorch.py @@ -19,14 +19,14 @@ import json from pathlib import Path -import torch -from PIL import Image - import requests import timm +import torch from huggingface_hub import hf_hub_download +from PIL import Image from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform + from transformers import ( BitConfig, ViTHybridConfig, diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py index 12bb63fc66feeb..8d7b5b83afe4b9 100644 --- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py @@ -102,7 +102,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_convert_rgb: bool = True, - **kwargs + **kwargs, ) -> None: super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 224} @@ -128,7 +128,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BICUBIC, data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge @@ -155,7 +155,7 @@ def center_crop( image: np.ndarray, size: Dict[str, int], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the @@ -179,7 +179,7 @@ def rescale( image: np.ndarray, scale: Union[int, float], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ): """ Rescale an image by a scale factor. image = image * scale. @@ -200,7 +200,7 @@ def normalize( mean: Union[float, List[float]], std: Union[float, List[float]], data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Normalize an image. image = (image - image_mean) / image_std. 
@@ -233,7 +233,7 @@ def preprocess( do_convert_rgb: bool = None, return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index 650e065cf3caaf..df35c13f689dca 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -267,7 +267,6 @@ def __init__(self, config: ViTHybridConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -325,7 +324,6 @@ def __init__(self, config: ViTHybridConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_mae/configuration_vit_mae.py b/src/transformers/models/vit_mae/configuration_vit_mae.py index b7167fbbdc149a..4c065421a9f19d 100644 --- a/src/transformers/models/vit_mae/configuration_vit_mae.py +++ b/src/transformers/models/vit_mae/configuration_vit_mae.py @@ -116,7 +116,7 @@ def __init__( decoder_intermediate_size=2048, mask_ratio=0.75, norm_pix_loss=False, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py index 4cf9a75b674b42..fc61d8924c8d01 100644 --- a/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py +++ b/src/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py @@ -16,10 +16,10 @@ import argparse +import requests import torch from PIL import Image -import requests from transformers import ViTMAEConfig, ViTMAEFeatureExtractor, ViTMAEForPreTraining diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 119b7ff3783444..ef0c7c9f36869e 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -388,7 +388,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -446,7 +445,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_msn/configuration_vit_msn.py b/src/transformers/models/vit_msn/configuration_vit_msn.py index 057824e5d4e133..87d9a37a68e067 100644 --- a/src/transformers/models/vit_msn/configuration_vit_msn.py +++ b/src/transformers/models/vit_msn/configuration_vit_msn.py @@ -98,7 +98,7 @@ def __init__( patch_size=16, num_channels=3, qkv_bias=True, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/vit_msn/convert_msn_to_pytorch.py 
b/src/transformers/models/vit_msn/convert_msn_to_pytorch.py index f04d26d5eb886a..482073a4faa005 100644 --- a/src/transformers/models/vit_msn/convert_msn_to_pytorch.py +++ b/src/transformers/models/vit_msn/convert_msn_to_pytorch.py @@ -17,11 +17,11 @@ import argparse import json +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import ViTFeatureExtractor, ViTMSNConfig, ViTMSNModel from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index 61faa34eb978f4..d2c8547aa0d83a 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -238,7 +238,6 @@ def __init__(self, config: ViTMSNConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -296,7 +295,6 @@ def __init__(self, config: ViTMSNConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index ab7d116118171f..7afcd3f0ee28e2 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -253,7 +253,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py index c02a23d8a44060..9550b7c2a9ef90 100644 --- a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py +++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py @@ -71,7 +71,7 @@ def __init__( padding_value=0.0, return_attention_mask=False, do_normalize=True, - **kwargs + **kwargs, ): super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) self.return_attention_mask = return_attention_mask @@ -109,7 +109,7 @@ def __call__( return_attention_mask: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
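A large share of the hunks above and below make the same one-character change: a trailing comma after the final `**kwargs` parameter. Python 3.6 and later accept a trailing comma after `*args`/`**kwargs` in a signature, and black treats such a "magic trailing comma" as a signal to keep the parameter list expanded one per line. A tiny illustration with a hypothetical function (not from the repository):

```python
# Hypothetical signature showing the formatting change applied throughout this diff:
# the trailing comma after **kwargs is valid syntax on Python >= 3.6 and tells black
# to keep the parameters on separate lines ("magic trailing comma").
def build_processor(
    do_resize: bool = True,
    do_normalize: bool = True,
    **kwargs,
):
    return {"do_resize": do_resize, "do_normalize": do_normalize, **kwargs}
```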
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 364d1590efa9d7..86cfb5e089ea00 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -17,12 +17,11 @@ from functools import partial from typing import Optional, Tuple, Union -import numpy as np - import flax import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen.attention import dot_product_attention_weights from flax.traverse_util import flatten_dict, unflatten_dict @@ -663,7 +662,6 @@ def __call__( output_hidden_states=False, return_dict=True, ): - if attention_mask is not None: # make sure padded tokens are not attended to hidden_states = jnp.where( @@ -1034,7 +1032,6 @@ def _conv_out_length(input_length, kernel_size, stride): def _get_feature_vector_attention_mask( self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None ): - # Effectively attention_mask.sum(-1), but not inplace to be able to run # on inference mode. non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index dfaf53099c0f5a..a3f2fd0e1e175f 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -357,7 +357,6 @@ def __init__( self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -369,7 +368,6 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -406,7 +404,6 @@ def compute_output_shape(self, input_shape): return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): - group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -419,7 +416,6 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -471,7 +467,6 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): - dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -492,19 +487,16 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): - if self.axis == 0: raise ValueError( "You are trying to normalize your batch axis. 
Do you want to use tf.layer.batch_normalization instead" ) def _create_input_spec(self, input_shape): - dim = input_shape[self.axis] self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -520,7 +512,6 @@ def _add_gamma_weight(self, input_shape): self.gamma = None def _add_beta_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -1684,7 +1675,6 @@ def call( logits = self.lm_head(hidden_states) if labels is not None: - if tf.reduce_max(labels) >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index c364d52807c615..3aa983f8e570ba 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1692,7 +1692,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index f4c4bea0722ffe..42fd1131cf0f26 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -168,7 +168,7 @@ def __init__( word_delimiter_token="|", replace_word_delimiter_char=" ", do_lower_case=False, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -424,7 +424,7 @@ def batch_decode( clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, output_word_offsets: bool = False, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. 
@@ -494,7 +494,7 @@ def decode( clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, output_word_offsets: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -735,7 +735,7 @@ def __init__( do_lower_case=False, do_normalize=False, return_attention_mask=False, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -803,7 +803,7 @@ def __call__( pad_to_multiple_of: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -888,7 +888,7 @@ def _decode( token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: """ special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the diff --git a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py index 2a3f951b3960e6..b9f24c7e708057 100644 --- a/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py @@ -266,7 +266,7 @@ def __init__( max_source_positions=5000, conv_depthwise_kernel_size=31, conformer_conv_dropout=0.1, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index c1dc7ae94a77cc..12ce81465fd806 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1604,10 +1604,10 @@ def __init__(self, config): if config.vocab_size is None: raise ValueError( - f"You are trying to instantiate {self.__class__} with a configuration that does not define the" - " vocabulary size of the language model head. Please instantiate the model as follows:" - " `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of" - " your model's configuration." + f"You are trying to instantiate {self.__class__} with a configuration that " + "does not define the vocabulary size of the language model head. Please " + "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. " + "or define `vocab_size` of your model's configuration." 
) output_hidden_size = ( config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size @@ -1668,7 +1668,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") @@ -1721,8 +1720,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Sequence classification does not support the use of Wav2Vec2Conformer adapters" - " (config.add_adapter=True)" + "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" ) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings @@ -1833,8 +1831,7 @@ def __init__(self, config): if hasattr(config, "add_adapter") and config.add_adapter: raise ValueError( - "Audio frame classification does not support the use of Wav2Vec2Conformer adapters" - " (config.add_adapter=True)" + "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)" ) self.wav2vec2_conformer = Wav2Vec2ConformerModel(config) num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index c983c4be826430..74e2d3525b01c8 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -141,7 +141,7 @@ def __init__( do_phonemize=True, phonemizer_lang="en-us", phonemizer_backend="espeak", - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, @@ -453,7 +453,7 @@ def decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -509,7 +509,7 @@ def batch_decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, output_char_offsets: bool = False, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index 4c97657d026b42..becbe100240d88 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -246,7 +246,7 @@ def __init__( adapter_stride=2, num_adapter_layers=3, output_hidden_size=None, - **kwargs + **kwargs, ): super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) self.hidden_size = hidden_size diff --git a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py index 91758cc9595290..84e3d231ea3845 100644 --- a/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py @@ -19,8 +19,6 @@ import torch -from transformers import WavLMConfig, WavLMModel, logging - # Step 1. clone https://github.com/microsoft/unilm # Step 2. 
git checkout to https://github.com/microsoft/unilm/commit/b94ec76c36f02fb2b0bf0dcb0b8554a2185173cd # Step 3. cd unilm @@ -29,6 +27,8 @@ from unilm.wavlm.WavLM import WavLM as WavLMOrig from unilm.wavlm.WavLM import WavLMConfig as WavLMConfigOrig +from transformers import WavLMConfig, WavLMModel, logging + logging.set_verbosity_info() logger = logging.get_logger(__name__) @@ -179,7 +179,6 @@ def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_gro @torch.no_grad() def convert_wavlm_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): - # load the pre-trained checkpoints checkpoint = torch.load(checkpoint_path) cfg = WavLMConfigOrig(checkpoint["cfg"]) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index e4813447168888..f4e9305d6bcd9e 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1352,7 +1352,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/whisper/configuration_whisper.py b/src/transformers/models/whisper/configuration_whisper.py index 56159eb9359b14..934d54be48fcfb 100644 --- a/src/transformers/models/whisper/configuration_whisper.py +++ b/src/transformers/models/whisper/configuration_whisper.py @@ -185,7 +185,7 @@ def __init__( eos_token_id=50256, suppress_tokens=None, begin_suppress_tokens=[220, 50256], - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.num_mel_bins = num_mel_bins diff --git a/src/transformers/models/whisper/convert_openai_to_hf.py b/src/transformers/models/whisper/convert_openai_to_hf.py index d838515fec859d..7c2e0c40a0422f 100644 --- a/src/transformers/models/whisper/convert_openai_to_hf.py +++ b/src/transformers/models/whisper/convert_openai_to_hf.py @@ -50,7 +50,6 @@ def remove_ignore_keys_(state_dict): "mlp.0": "fc1", "mlp.2": "fc2", "mlp_ln": "final_layer_norm", - "blocks": "layers", ".attn.query": ".self_attn.q_proj", ".attn.key": ".self_attn.k_proj", ".attn.value": ".self_attn.v_proj", diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index 0b476ee34a302b..77c85a2477ade9 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -66,7 +66,7 @@ def __init__( n_fft=400, padding_value=0.0, return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask - **kwargs + **kwargs, ): super().__init__( feature_size=feature_size, @@ -225,7 +225,7 @@ def __call__( padding: Optional[str] = "max_length", max_length: Optional[int] = None, sampling_rate: Optional[int] = None, - **kwargs + **kwargs, ) -> BatchFeature: """ Main method to featurize and prepare for the model one or several sequence(s). 
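The hunk in `convert_openai_to_hf.py` above drops a `"blocks": "layers"` entry from the rename mapping; the surrounding dict is not fully shown here, but the removal looks like a duplicated dictionary key, which duplicate the pyflakes-style checks in ruff commonly report. Duplicate keys in a dict literal are legal but silently collapse to a single entry, as this small illustration shows (the keys are copied from the visible hunk; the duplication itself is an assumption about the original file):

```python
# In a dict literal, a repeated key silently keeps only the last value,
# so a second "blocks": "layers" entry adds nothing at runtime.
rename_map = {
    "blocks": "layers",
    "mlp.0": "fc1",
    "mlp.2": "fc2",
    "blocks": "layers",  # duplicate: harmless at runtime, typically flagged by linters
}
print(len(rename_map))  # 3 -- the duplicate collapses into a single key
```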
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index e8148366ac882c..dd6ebcb2c8ccaf 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -142,7 +142,7 @@ def __init__( dropout: float = 0.0, is_decoder: bool = False, bias: bool = True, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.embed_dim = embed_dim @@ -1363,7 +1363,7 @@ def prepare_inputs_for_generation( encoder_outputs=None, attention_mask=None, decoder_attention_mask=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 38bfe133a076b2..13e3ad7abbfb60 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -102,7 +102,6 @@ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional super().__init__(num_positions, embedding_dim) def forward(self, input_ids, past_key_values_length=0): - return self.weight[past_key_values_length : past_key_values_length + input_ids.shape[-1]] @@ -897,7 +896,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache =" @@ -923,7 +921,6 @@ def custom_forward(*inputs): None, # past_key_value ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -1244,7 +1241,7 @@ def generate( task=None, language=None, is_multilingual=None, - **kwargs + **kwargs, ): """ @@ -1383,7 +1380,7 @@ def prepare_inputs_for_generation( use_cache=None, encoder_outputs=None, attention_mask=None, - **kwargs + **kwargs, ): # cut decoder_input_ids if past is used if past_key_values is not None: diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 4f687b33448d2d..7e8f675a15b0eb 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -18,7 +18,6 @@ from typing import List, Optional, Tuple, Union import numpy as np - import regex as re from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -269,9 +268,8 @@ def __init__( language=None, task=None, predict_timestamps=False, - **kwargs + **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token @@ -562,7 +560,7 @@ def decode( output_offsets: bool = False, time_precision=0.02, decode_with_timestamps: bool = False, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 5a005e7a571954..ed2bdea69a50ac 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -100,7 +100,7 @@ def __init__( pad_token_id=1, 
bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -118,7 +118,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the text config dict if we are loading from XCLIPConfig @@ -219,7 +218,7 @@ def __init__( initializer_range=0.02, initializer_factor=1.0, drop_path_rate=0.0, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -244,7 +243,6 @@ def __init__( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) # get the vision config dict if we are loading from XCLIPConfig @@ -311,7 +309,7 @@ def __init__( prompt_attention_dropout=0.0, prompt_projection_dropout=0.0, logit_scale_init_value=2.6592, - **kwargs + **kwargs, ): super().__init__(**kwargs) diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index 8210b3f709e39a..d4281fe849d61c 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -15,11 +15,11 @@ import argparse +import gdown import numpy as np import torch - -import gdown from huggingface_hub import hf_hub_download + from transformers import ( CLIPTokenizer, CLIPTokenizerFast, @@ -216,7 +216,6 @@ def prepare_video(num_frames): def convert_xclip_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): - model_to_url = { # fully supervised kinetics-400 checkpoints "xclip-base-patch32": "https://github.com/nbl97/X-CLIP_Model_Zoo/releases/download/v1.0/k400_32_8.pth", diff --git a/src/transformers/models/xglm/configuration_xglm.py b/src/transformers/models/xglm/configuration_xglm.py index c9ac1111a08d78..8a59ee6682d6ea 100644 --- a/src/transformers/models/xglm/configuration_xglm.py +++ b/src/transformers/models/xglm/configuration_xglm.py @@ -114,7 +114,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings diff --git a/src/transformers/models/xglm/modeling_flax_xglm.py b/src/transformers/models/xglm/modeling_flax_xglm.py index f6ae740624094f..5a2a8951f1a805 100644 --- a/src/transformers/models/xglm/modeling_flax_xglm.py +++ b/src/transformers/models/xglm/modeling_flax_xglm.py @@ -20,11 +20,10 @@ from functools import partial from typing import Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen.attention import dot_product_attention_weights @@ -562,7 +561,7 @@ def __init__( seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, **kwargs) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) @@ -737,7 +736,6 @@ def __call__( return_dict: bool = True, deterministic: bool = True, ): - outputs = self.model( input_ids, attention_mask, diff --git 
a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py index 2efcd4b1e32703..1dac55651563cc 100644 --- a/src/transformers/models/xglm/modeling_tf_xglm.py +++ b/src/transformers/models/xglm/modeling_tf_xglm.py @@ -809,7 +809,6 @@ def call( training: Optional[bool] = False, **kwargs: Any, ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: - outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index ff02620d429c9d..6fb34917437c7d 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -739,7 +739,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache = True` is incompatible with gradient checkpointing`. Setting `use_cache =" @@ -765,7 +764,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py index 4dc6c3d37f7cd4..f27c827134bf37 100644 --- a/src/transformers/models/xglm/tokenization_xglm.py +++ b/src/transformers/models/xglm/tokenization_xglm.py @@ -124,7 +124,7 @@ def __init__( unk_token="", pad_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/xglm/tokenization_xglm_fast.py b/src/transformers/models/xglm/tokenization_xglm_fast.py index a0d4cebafeef38..834b8a47ecb920 100644 --- a/src/transformers/models/xglm/tokenization_xglm_fast.py +++ b/src/transformers/models/xglm/tokenization_xglm_fast.py @@ -110,7 +110,7 @@ def __init__( cls_token="", unk_token="", pad_token="", - **kwargs + **kwargs, ): # Compatibility with the original tokenizer self.num_madeup_words = 7 diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py index 52a53d5bf0060e..cd8d721bfc37d2 100644 --- a/src/transformers/models/xlm/configuration_xlm.py +++ b/src/transformers/models/xlm/configuration_xlm.py @@ -192,7 +192,7 @@ def __init__( lang_id=0, pad_token_id=2, bos_token_id=0, - **kwargs + **kwargs, ): """Constructs XLMConfig.""" self.vocab_size = vocab_size diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index d054f535a42c4e..2d0adcddf95c21 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -102,7 +102,6 @@ def get_masks(slen, lengths, causal, padding_mask=None): class MultiHeadAttention(nn.Module): - NEW_ID = itertools.count() def __init__(self, n_heads, dim, config): diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py index 8bb021c5b96987..cbfb2b48ff0f1f 100644 --- a/src/transformers/models/xlm/tokenization_xlm.py +++ b/src/transformers/models/xlm/tokenization_xlm.py @@ -611,7 +611,7 @@ def __init__( lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, - **kwargs + **kwargs, ): super().__init__( unk_token=unk_token, diff --git a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py 
b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py index cdca20ef3b43a8..29c8678f279981 100644 --- a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py @@ -134,7 +134,7 @@ def __init__( pad_token_id: Optional[int] = 0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, - **kwargs + **kwargs, ): self.vocab_size = vocab_size self.hidden_size = hidden_size diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index c8b6fee9facd99..0327bb4f08e200 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -681,7 +681,6 @@ def forward( past_key_value: Optional[Tuple[Tensor]] = None, output_attentions: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: - batch_size, tgt_len, hidden_size = hidden_states.size() # if key_value_states are provided this layer is used as a cross-attention layer @@ -1612,7 +1611,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index af8308287939f8..8468eb49d64d5a 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -141,7 +141,7 @@ def __init__( cls_token="[CLS]", mask_token="[MASK]", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py index c8bad5905a489d..98e12d07826edc 100644 --- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py @@ -132,7 +132,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py index 9ae7a9b8b8c9c1..ed5e113770e544 100644 --- a/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py @@ -17,11 +17,10 @@ from typing import Callable, Optional, Tuple -import numpy as np - import flax.linen as nn import jax import jax.numpy as jnp +import numpy as np from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.linen import combine_masks, make_causal_mask from flax.linen import partitioning as nn_partitioning @@ -752,7 +751,7 @@ def __init__( dtype: jnp.dtype = jnp.float32, _do_init: bool = True, gradient_checkpointing: bool = False, - **kwargs + **kwargs, ): module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) super().__init__(config, module, 
input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) diff --git a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py index 5377bfdec19e30..b5fc694148e724 100644 --- a/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_tf_xlm_roberta.py @@ -723,7 +723,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: - if not self.config.is_decoder: use_cache = False diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 1807cc67d524b7..30d75a29496667 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -502,7 +502,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 40928d8dc30623..54a46842ff156d 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -145,7 +145,7 @@ def __init__( pad_token="", mask_token="", sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index f99e3c086a88c5..68f8df06ff3d24 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -147,7 +147,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py index a36fe66de66c5c..acf30bf3878a88 100644 --- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py @@ -119,7 +119,7 @@ def __init__( position_embedding_type="absolute", use_cache=True, classifier_dropout=None, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index a54d6835b92846..9250fa96398470 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -491,7 +491,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py index 3aa05f77c985c0..9ebc1f8bb9fb6f 100644 --- a/src/transformers/models/xlnet/configuration_xlnet.py +++ b/src/transformers/models/xlnet/configuration_xlnet.py @@ -176,7 +176,7 @@ def __init__( pad_token_id=5, bos_token_id=1, eos_token_id=2, - **kwargs + **kwargs, ): """Constructs XLNetConfig.""" self.vocab_size = vocab_size diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py index 42ec90efc4d46a..080dd91f2301cd 100644 --- a/src/transformers/models/xlnet/modeling_tf_xlnet.py +++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py @@ -597,7 +597,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ): - if training and use_mems is None: use_mems = self.use_mems_train else: diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 07362fe29c53f7..d6ed227ff886aa 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1080,7 +1080,6 @@ def forward( return_dict: Optional[bool] = None, **kwargs, # delete after depreciation warning is removed ) -> Union[Tuple, XLNetModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 9dc6fd24596419..5ad655c4173554 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -145,7 +145,7 @@ def __init__( mask_token="", additional_special_tokens=["", ""], sp_model_kwargs: Optional[Dict[str, Any]] = None, - **kwargs + **kwargs, ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -256,7 +256,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py index c27c5262f94c1f..3f6ff200a84ac3 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py +++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py @@ -142,7 +142,7 @@ def __init__( cls_token="<cls>", mask_token="<mask>", additional_special_tokens=["<eop>", "<eod>"], - **kwargs + **kwargs, ): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token diff --git a/src/transformers/models/yolos/configuration_yolos.py b/src/transformers/models/yolos/configuration_yolos.py index c6bfbff444dd4f..538f85b1eb29e8 100644 --- a/src/transformers/models/yolos/configuration_yolos.py +++ b/src/transformers/models/yolos/configuration_yolos.py @@ -129,7 +129,7 @@ def __init__( bbox_loss_coefficient=5, giou_loss_coefficient=2, eos_coefficient=0.1, - **kwargs + **kwargs, ): super().__init__(**kwargs) @@ -160,7 +160,6 @@ def __init__( class YolosOnnxConfig(OnnxConfig): - torch_onnx_minimum_version = version.parse("1.11") @property diff --git a/src/transformers/models/yolos/convert_yolos_to_pytorch.py b/src/transformers/models/yolos/convert_yolos_to_pytorch.py index d953936e24ff01..02279518832bd1 100644 --- a/src/transformers/models/yolos/convert_yolos_to_pytorch.py +++ b/src/transformers/models/yolos/convert_yolos_to_pytorch.py @@ -19,11 +19,11 @@ import json from pathlib import Path +import requests import torch +from huggingface_hub import hf_hub_download from PIL import Image -import requests -from huggingface_hub import hf_hub_download from transformers import YolosConfig, YolosFeatureExtractor, YolosForObjectDetection from transformers.utils import logging diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py index afe6f55bd88a71..f49d5d14fd5edd 100644 --- a/src/transformers/models/yolos/image_processing_yolos.py +++ b/src/transformers/models/yolos/image_processing_yolos.py @@ -698,7 +698,7 @@ def __init__( image_mean: Union[float, List[float]] = None, image_std: Union[float, List[float]] = None, do_pad: bool = True, - **kwargs + **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: do_pad = kwargs.pop("pad_and_return_pixel_mask") @@ -811,7 +811,7 @@ def resize( size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, data_format: Optional[ChannelDimension] = None, - **kwargs + **kwargs, ) -> np.ndarray: """ Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an @@ -955,7 +955,7 @@ def preprocess( format: Optional[Union[str, AnnotionFormat]] = None, return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - **kwargs + **kwargs, ) -> BatchFeature: """ Preprocess an image or a batch of images so that it can be used by the model.
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 1921b87a9fcb9c..e3cb02ceae6ec0 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -322,7 +322,6 @@ def __init__(self, config: YolosConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -380,7 +379,6 @@ def __init__(self, config: YolosConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py index 7a2458146c6f59..c6d2b176ef947d 100644 --- a/src/transformers/models/yoso/configuration_yoso.py +++ b/src/transformers/models/yoso/configuration_yoso.py @@ -120,7 +120,7 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - **kwargs + **kwargs, ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py index 2b9a2c7cd85338..be46a4de81b30c 100644 --- a/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py +++ b/src/transformers/models/yoso/convert_yoso_pytorch_to_pytorch.py @@ -75,7 +75,6 @@ def convert_checkpoint_helper(max_position_embeddings, orig_state_dict): def convert_yoso_checkpoint(checkpoint_path, yoso_config_file, pytorch_dump_path): - orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"] config = YosoConfig.from_json_file(yoso_config_file) model = YosoForMaskedLM(config) diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index df1ec304b952e2..cf0c814ff95080 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -98,7 +98,6 @@ def normalize(input_tensors): def hashing(query, key, num_hash, hash_len): - if len(query.size()) != 3: raise ValueError("Query has incorrect size.") if len(key.size()) != 3: diff --git a/src/transformers/onnx/convert.py b/src/transformers/onnx/convert.py index e953207b3a595b..ee9c498e73ba28 100644 --- a/src/transformers/onnx/convert.py +++ b/src/transformers/onnx/convert.py @@ -246,9 +246,8 @@ def export_tensorflow( `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from the ONNX configuration. 
""" - import tensorflow as tf - import onnx + import tensorflow as tf import tf2onnx if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None: diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index 3c833875a87744..e3a2e59c37f801 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -219,7 +219,7 @@ def __init__( include_in_weight_decay: Optional[List[str]] = None, exclude_from_weight_decay: Optional[List[str]] = None, name: str = "AdamWeightDecay", - **kwargs + **kwargs, ): super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) self.weight_decay_rate = weight_decay_rate diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index f3645c6cb61af4..434009d7f293ea 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -24,9 +24,8 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from numpy import isin - from huggingface_hub import model_info +from numpy import isin from ..configuration_utils import PretrainedConfig from ..dynamic_module_utils import get_class_from_dynamic_module diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py index a58247d4128790..7f104e74e12824 100644 --- a/src/transformers/pipelines/audio_classification.py +++ b/src/transformers/pipelines/audio_classification.py @@ -15,7 +15,6 @@ from typing import Union import numpy as np - import requests from ..utils import add_end_docstrings, is_torch_available, logging diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 8c552cbdc307a5..c8833ed3bd9303 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -15,7 +15,6 @@ from typing import TYPE_CHECKING, Dict, Optional, Union import numpy as np - import requests from ..utils import is_torch_available, logging @@ -302,7 +301,7 @@ def __init__( feature_extractor: Union["SequenceFeatureExtractor", str], *, decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None, - **kwargs + **kwargs, ): super().__init__(**kwargs) self.feature_extractor = feature_extractor diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index c4158f3cc4818c..db349944a4fda5 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -165,7 +165,6 @@ def _forward(self, model_inputs): def postprocess( self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5 ): - fn = None if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"): fn = self.image_processor.post_process_panoptic_segmentation diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py index d4bb7f2102900d..597a0980e24e05 100644 --- a/src/transformers/pipelines/question_answering.py +++ b/src/transformers/pipelines/question_answering.py @@ -307,7 +307,7 @@ def _sanitize_parameters( max_question_len=None, handle_impossible_answer=None, align_to_words=None, - **kwargs + **kwargs, ): # Set defaults values preprocess_params = {} diff --git a/src/transformers/pipelines/table_question_answering.py 
b/src/transformers/pipelines/table_question_answering.py index 615037a6d96a91..c01d7e49053a91 100644 --- a/src/transformers/pipelines/table_question_answering.py +++ b/src/transformers/pipelines/table_question_answering.py @@ -23,7 +23,6 @@ if is_tf_available() and is_tensorflow_probability_available(): import tensorflow as tf - import tensorflow_probability as tfp from ..models.auto.modeling_tf_auto import ( diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py index a9f73218ad54fb..bb8e860ce9aeee 100644 --- a/src/transformers/pipelines/text2text_generation.py +++ b/src/transformers/pipelines/text2text_generation.py @@ -78,7 +78,7 @@ def _sanitize_parameters( clean_up_tokenization_spaces=None, truncation=None, stop_sequence=None, - **generate_kwargs + **generate_kwargs, ): preprocess_params = {} if truncation is not None: diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py index b19d58f4ffbb44..2800515c821a42 100644 --- a/src/transformers/pipelines/text_generation.py +++ b/src/transformers/pipelines/text_generation.py @@ -97,7 +97,7 @@ def _sanitize_parameters( prefix=None, handle_long_generation=None, stop_sequence=None, - **generate_kwargs + **generate_kwargs, ): preprocess_params = {} if prefix is not None: diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py index e126bb7b1ea11a..5dc243d1acf276 100644 --- a/src/transformers/pipelines/token_classification.py +++ b/src/transformers/pipelines/token_classification.py @@ -22,7 +22,6 @@ class TokenClassificationArgumentHandler(ArgumentHandler): """ def __call__(self, inputs: Union[str, List[str]], **kwargs): - if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0: inputs = list(inputs) batch_size = len(inputs) @@ -141,7 +140,6 @@ def _sanitize_parameters( aggregation_strategy: Optional[AggregationStrategy] = None, offset_mapping: Optional[List[Tuple[int, int]]] = None, ): - preprocess_params = {} if offset_mapping is not None: preprocess_params["offset_mapping"] = offset_mapping diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 785a54e7d7a3c3..41a1699a7e82cf 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -9,7 +9,6 @@ if is_decord_available(): import numpy as np - from decord import VideoReader @@ -85,7 +84,6 @@ def __call__(self, videos: Union[str, List[str]], **kwargs): return super().__call__(videos, **kwargs) def preprocess(self, video, num_frames=None, frame_sampling_rate=1): - if num_frames is None: num_frames = self.model.config.num_frames diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index cd4ff60c03d333..cf05999861c05f 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -66,7 +66,7 @@ def __call__( self, image: Union[str, "Image.Image", List[Dict[str, Any]]], candidate_labels: Union[str, List[str]] = None, - **kwargs + **kwargs, ): """ Detect objects (bounding boxes & classes) in the image(s) passed as inputs. 
@@ -168,7 +168,6 @@ def _forward(self, model_inputs): return model_outputs def postprocess(self, model_outputs, threshold=0.1, top_k=None): - results = [] for model_output in model_outputs: label = model_output["candidate_label"] diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 3a8ee66218bf60..821fd8c545f721 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -35,6 +35,7 @@ from unittest import mock import huggingface_hub + from transformers import logging as transformers_logging from .deepspeed import is_deepspeed_available @@ -778,6 +779,7 @@ def get_tests_dir(append_path=None): # The original code came from: # https://github.com/fastai/fastai/blob/master/tests/utils/text.py + # When any function contains print() calls that get overwritten, like progress bars, # a special care needs to be applied, since under pytest -s captured output (capsys # or contextlib.redirect_stdout) contains any temporary printed strings, followed by diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 6d33266c03f4e7..4dbbee4144c552 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -609,7 +609,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -693,7 +693,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: def get_input_ids(text): if isinstance(text, str): @@ -924,7 +924,7 @@ def _decode( skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, spaces_between_special_tokens: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 8f3c392d382227..77a8db79520dd1 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1824,7 +1824,7 @@ def _from_pretrained( cache_dir=None, local_files_only=False, _commit_hash=None, - **kwargs + **kwargs, ): # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json # file or if `from_slow` is set to True. @@ -1932,7 +1932,6 @@ def convert_added_tokens(obj: Union[AddedToken, Any]): model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] if model_max_length is not None and isinstance(model_max_length, (int, float)): - model_max_length = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) # TODO(PVP) - uncomment following line in Transformers v5 # init_kwargs["model_max_length"] = model_max_length @@ -2278,7 +2277,7 @@ def encode( max_length: Optional[int] = None, stride: int = 0, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs + **kwargs, ) -> List[int]: """ Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. 
@@ -2474,7 +2473,7 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of @@ -2558,7 +2557,7 @@ def _call_one( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: # Input type checking for clearer error def _is_valid_text_input(t): @@ -2671,7 +2670,7 @@ def encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a sequence or a pair of sequences. @@ -2743,7 +2742,7 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: raise NotImplementedError @@ -2773,7 +2772,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: """ Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. @@ -2846,7 +2845,7 @@ def _batch_encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: raise NotImplementedError @@ -3083,7 +3082,7 @@ def prepare_for_model( return_length: bool = False, verbose: bool = True, prepend_batch_axis: bool = False, - **kwargs + **kwargs, ) -> BatchEncoding: """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It @@ -3271,8 +3270,7 @@ def truncate_sequences( ) if truncation_strategy == TruncationStrategy.ONLY_FIRST: error_msg = ( - error_msg - + "Please select another truncation strategy than " + error_msg + "Please select another truncation strategy than " f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." ) logger.error(error_msg) @@ -3373,7 +3371,6 @@ def _pad( if self.padding_side == "right": if return_attention_mask: - encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference if "token_type_ids" in encoded_inputs: encoded_inputs["token_type_ids"] = ( @@ -3415,7 +3412,7 @@ def batch_decode( sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> List[str]: """ Convert a list of lists of token ids into a list of strings by calling decode. 
@@ -3448,7 +3445,7 @@ def decode( token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: """ Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special @@ -3484,7 +3481,7 @@ def _decode( token_ids: Union[int, List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: raise NotImplementedError diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 4d13e34742370a..bcdbd8325bb351 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -411,7 +411,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, (tuple, list)): raise TypeError( f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})" @@ -495,9 +494,8 @@ def _encode_plus( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - **kwargs + **kwargs, ) -> BatchEncoding: - batched_input = [(text, text_pair)] if text_pair else [text] batched_output = self._batch_encode_plus( batched_input, @@ -542,7 +540,7 @@ def _decode( token_ids: Union[int, List[int]], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, - **kwargs + **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index da140111cc2d4f..f10cc2f4b490ba 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -37,7 +37,8 @@ # Integrations must be imported before ML frameworks: -from .integrations import ( # isort: split +# isort: off +from .integrations import ( default_hp_search_backend, get_reporting_integration_callbacks, hp_params, @@ -52,16 +53,17 @@ run_hp_search_wandb, ) +# isort: on + import numpy as np import torch import torch.distributed as dist +from huggingface_hub import Repository, create_repo from packaging import version from torch import nn from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler -from huggingface_hub import Repository, create_repo - from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator @@ -1414,9 +1416,8 @@ def _wrap_model(self, model, training=True, dataloader=None): # Distributed training using PyTorch FSDP elif self.fsdp is not None: # PyTorch FSDP! 
- from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload + from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, MixedPrecision from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy if FSDPOption.OFFLOAD in self.args.fsdp: @@ -2004,7 +2005,6 @@ def _get_output_dir(self, trial): return run_dir def _load_from_checkpoint(self, resume_from_checkpoint, model=None): - if model is None: model = self.model @@ -2071,7 +2071,6 @@ def _load_best_model(self): model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model if os.path.exists(best_model_path): if self.deepspeed: - if self.model_wrapped is not None: # this removes the pre-hooks from the previous engine self.model_wrapped.destroy() @@ -2127,7 +2126,6 @@ def _load_best_model(self): ) def _issue_warnings_after_load(self, load_result): - if len(load_result.missing_keys) != 0: if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set( self.model._keys_to_ignore_on_save @@ -2684,7 +2682,6 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa if self.args.should_save: self._save(output_dir, state_dict=state_dict) elif self.deepspeed: - # this takes care of everything as long as we aren't under zero3 if self.args.should_save: self._save(output_dir) @@ -2983,7 +2980,6 @@ def evaluation_loop( # if eval is called w/o train init deepspeed here if args.deepspeed and not self.deepspeed: - # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually deepspeed_engine, _, _ = deepspeed_init( diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index cabc2ee51384d9..daab54a4e85831 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -595,6 +595,7 @@ class DistributedLengthGroupedSampler(DistributedSampler): Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while keeping a bit of randomness. """ + # Copied and adapted from PyTorch DistributedSampler. def __init__( self, diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py index 1ee85ce84c8ad6..4a79516d265c1b 100644 --- a/src/transformers/trainer_seq2seq.py +++ b/src/transformers/trainer_seq2seq.py @@ -33,7 +33,7 @@ def evaluate( eval_dataset: Optional[Dataset] = None, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "eval", - **gen_kwargs + **gen_kwargs, ) -> Dict[str, float]: """ Run evaluation and returns metrics. @@ -82,7 +82,7 @@ def predict( test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test", - **gen_kwargs + **gen_kwargs, ) -> PredictionOutput: """ Run prediction and returns predictions and potential metrics. 
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 737dd4deaf6887..1f6435b787a02a 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -23,11 +23,14 @@ # Integrations must be imported before ML frameworks: -from .integrations import ( # isort: split +# isort: off +from .integrations import ( is_comet_available, is_wandb_available, ) +# isort: on + import numpy as np import tensorflow as tf from tensorflow.python.distribute.values import PerReplica @@ -462,7 +465,6 @@ def prediction_step( @tf.function def distributed_prediction_steps(self, batch): - nb_instances_in_batch = self._compute_nb_instances(batch) inputs = self._get_step_inputs(batch, nb_instances_in_batch) @@ -516,7 +518,6 @@ def train(self) -> None: epochs_trained = 0 steps_trained_in_current_epoch = 0 if self.model.ckpt_manager.latest_checkpoint: - logger.info( f"Checkpoint file {self.model.ckpt_manager.latest_checkpoint} found and restoring from checkpoint" ) @@ -560,7 +561,6 @@ def train(self) -> None: self._past = None for step, batch in enumerate(train_ds): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -704,7 +704,6 @@ def apply_gradients(self, features, labels, nb_instances_in_global_batch): @tf.function def distributed_training_steps(self, batch): with self.args.strategy.scope(): - nb_instances_in_batch = self._compute_nb_instances(batch) inputs = self._get_step_inputs(batch, nb_instances_in_batch) @@ -712,7 +711,6 @@ def distributed_training_steps(self, batch): @staticmethod def _compute_nb_instances(batch): - labels = batch[-1] if isinstance(labels, PerReplica): labels = tf.concat(labels.values, axis=0) @@ -723,7 +721,6 @@ def _compute_nb_instances(batch): @staticmethod def _get_step_inputs(batch, nb_instances): - features, labels = batch if isinstance(labels, PerReplica): diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index e857a260c7a991..484e8895125769 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -398,7 +398,6 @@ class TrainerMemoryTracker: } def __init__(self, skip_memory_metrics=False): - self.skip_memory_metrics = skip_memory_metrics if not is_psutil_available(): diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index d73ab249833ca0..39952073bc4664 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1165,7 +1165,6 @@ def __post_init__(self): self.half_precision_backend = self.fp16_backend if self.bf16 or self.bf16_full_eval: - if self.no_cuda and not is_torch_bf16_cpu_available() and not is_torch_tpu_available(): # cpu raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). 
You need torch>=1.10") diff --git a/src/transformers/utils/bitsandbytes.py b/src/transformers/utils/bitsandbytes.py index 4e14dbaf77d332..936079f45dcbb1 100644 --- a/src/transformers/utils/bitsandbytes.py +++ b/src/transformers/utils/bitsandbytes.py @@ -4,11 +4,10 @@ if is_bitsandbytes_available(): + import bitsandbytes as bnb import torch import torch.nn as nn - import bitsandbytes as bnb - if is_accelerate_available(): from accelerate import init_empty_weights from accelerate.utils import find_tied_parameters diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 7e951fdb192107..339ec1fc783111 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -64,7 +64,6 @@ def _generate_supported_model_class_names( model_name: Type[PretrainedConfig], supported_tasks: Optional[Union[str, List[str]]] = None, ) -> List[str]: - task_mapping = { "default": MODEL_MAPPING_NAMES, "pretraining": MODEL_FOR_PRETRAINING_MAPPING_NAMES, @@ -692,7 +691,6 @@ class HFTracer(Tracer): ] def __init__(self, autowrap_modules=(math,), autowrap_functions=()): - super().__init__(autowrap_modules=autowrap_modules, autowrap_functions=autowrap_functions) if not is_torch_fx_available(): @@ -713,7 +711,6 @@ def _generate_dummy_input( inputs_dict = {} if input_name in ["labels", "start_positions", "end_positions"]: - batch_size = shape[0] if model_class_name in [ *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES), @@ -1147,7 +1144,6 @@ def symbolic_trace( input_names: Optional[List[str]] = None, disable_check: bool = False, ) -> GraphModule: - """ Performs symbolic tracing on the model. diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py index 4f8ea58d9dcaf2..bb3575edf24f98 100644 --- a/src/transformers/utils/hub.py +++ b/src/transformers/utils/hub.py @@ -48,6 +48,7 @@ hf_raise_for_status, ) from requests.exceptions import HTTPError + from transformers.utils.logging import tqdm from . 
import __version__, logging @@ -709,7 +710,7 @@ def push_to_hub( use_auth_token: Optional[Union[bool, str]] = None, max_shard_size: Optional[Union[int, str]] = "10GB", create_pr: bool = False, - **deprecated_kwargs + **deprecated_kwargs, ) -> str: """ Upload the {object_files} to the 🤗 Model Hub while synchronizing a local clone of the repo in diff --git a/src/transformers/utils/logging.py b/src/transformers/utils/logging.py index a98e2f30fd6eb8..a1cce40dc2f121 100644 --- a/src/transformers/utils/logging.py +++ b/src/transformers/utils/logging.py @@ -18,19 +18,20 @@ import os import sys import threading -from logging import CRITICAL # NOQA -from logging import DEBUG # NOQA -from logging import ERROR # NOQA -from logging import FATAL # NOQA -from logging import INFO # NOQA -from logging import NOTSET # NOQA -from logging import WARN # NOQA -from logging import WARNING # NOQA +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) from typing import Optional -from tqdm import auto as tqdm_lib - import huggingface_hub.utils as hf_hub_utils +from tqdm import auto as tqdm_lib _lock = threading.Lock() @@ -67,17 +68,14 @@ def _get_default_logging_level(): def _get_library_name() -> str: - return __name__.split(".")[0] def _get_library_root_logger() -> logging.Logger: - return logging.getLogger(_get_library_name()) def _configure_library_root_logger() -> None: - global _default_handler with _lock: @@ -95,7 +93,6 @@ def _configure_library_root_logger() -> None: def _reset_library_root_logger() -> None: - global _default_handler with _lock: diff --git a/src/transformers/utils/model_parallel_utils.py b/src/transformers/utils/model_parallel_utils.py index bcbe808013596f..b5d23417cec190 100644 --- a/src/transformers/utils/model_parallel_utils.py +++ b/src/transformers/utils/model_parallel_utils.py @@ -33,14 +33,12 @@ def assert_device_map(device_map, num_blocks): if len(duplicate_blocks) != 0: raise ValueError( "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device." - " These attention blocks were specified more than once: " - + str(duplicate_blocks) + " These attention blocks were specified more than once: " + str(duplicate_blocks) ) if len(missing_blocks) != 0: raise ValueError( "There are attention blocks for this model that are not specified in the device_map. Add these attention " - "blocks to a device on the device_map: " - + str(missing_blocks) + "blocks to a device on the device_map: " + str(missing_blocks) ) if len(extra_blocks) != 0: raise ValueError( diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 65644ef3ac6ddd..eea27ba7044f15 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -21,8 +21,8 @@ from copy import deepcopy import datasets - from parameterized import parameterized + from tests.trainer.test_trainer import TrainerIntegrationCommon # noqa from transformers import AutoModel, TrainingArguments, is_torch_available, logging from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config @@ -271,7 +271,6 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T # --- These tests are enough to run on one of zero stages --- # def test_hf_ds_config_mismatch(self): - ds_config = self.get_config_dict(ZERO2) # Purposefully configure these values to mismatch TrainingArguments values. 
@@ -383,7 +382,6 @@ def test_stage3_nvme_offload(self): @require_optuna def test_hyperparameter_search(self): with mockenv_context(**self.dist_env_1_gpu): - ds_config_zero3_dict = self.get_config_dict(ZERO3) # hyperparameter_search requires model_init() to recreate the model for each trial @@ -599,7 +597,6 @@ def test_save_checkpoints(self, stage, dtype): @parameterized.expand(params, name_func=parameterized_custom_name_func) def test_can_resume_training_errors(self, stage, dtype): - with mockenv_context(**self.dist_env_1_gpu): ds_config_dict = self.get_config_dict(stage) output_dir = self.get_auto_remove_tmp_dir() @@ -765,7 +762,6 @@ def test_load_best_model(self, stage, dtype): ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True with mockenv_context(**self.dist_env_1_gpu): - args_dict = { "per_gpu_train_batch_size": 1, "per_gpu_eval_batch_size": 1, @@ -938,7 +934,6 @@ def test_inference(self, dtype): ) def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): - if do_train: train_metrics = load_json(os.path.join(output_dir, "train_results.json")) self.assertIn("train_samples_per_second", train_metrics) @@ -966,7 +961,6 @@ def run_and_check( extra_args_str: str = None, remove_args_str: str = None, ): - # we are doing quality testing so using a small real model output_dir = self.run_trainer( stage=stage, diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 6c1af5177671ed..984c7e756578b3 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -18,6 +18,7 @@ from os.path import dirname from parameterized import parameterized + from tests.trainer.test_trainer import TrainerIntegrationCommon # noqa from transformers import is_torch_available from transformers.testing_utils import ( diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index d0fc5582160521..d86fb337af04c5 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -22,6 +22,7 @@ from unittest.mock import patch from parameterized import parameterized + from transformers.testing_utils import ( CaptureStderr, ExtendSysPath, @@ -361,7 +362,6 @@ def run_trainer( args += extra_args_str.split() if distributed: - if n_gpus_to_use is None: n_gpus_to_use = get_gpu_count() master_port = get_torch_dist_unique_port() diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py index 2c655254f2e5ad..8add735a0bf07b 100644 --- a/tests/generation/test_configuration_utils.py +++ b/tests/generation/test_configuration_utils.py @@ -20,6 +20,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import AutoConfig, GenerationConfig from transformers.testing_utils import TOKEN, USER, is_staging_test diff --git a/tests/generation/test_flax_logits_process.py b/tests/generation/test_flax_logits_process.py index 27dea2b029dd8f..a45d75ae244bb6 100644 --- a/tests/generation/test_flax_logits_process.py +++ b/tests/generation/test_flax_logits_process.py @@ -27,6 +27,7 @@ if is_flax_available(): import jax import jax.numpy as jnp + from transformers.generation import ( FlaxForcedBOSTokenLogitsProcessor, FlaxForcedEOSTokenLogitsProcessor, diff --git a/tests/generation/test_flax_utils.py b/tests/generation/test_flax_utils.py index aabab559853bb0..c6182a2386e561 100644 --- a/tests/generation/test_flax_utils.py 
+++ b/tests/generation/test_flax_utils.py @@ -27,6 +27,7 @@ import jax.numpy as jnp from jax import jit + from transformers import AutoTokenizer, FlaxAutoModelForCausalLM from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py index 014ed4af1e6dc1..ad1263cb938a88 100644 --- a/tests/generation/test_framework_agnostic.py +++ b/tests/generation/test_framework_agnostic.py @@ -8,7 +8,6 @@ class GenerationIntegrationTestsMixin: - # To be populated by the child classes framework_dependent_parameters = { "AutoModelForSeq2SeqLM": None, diff --git a/tests/generation/test_tf_logits_process.py b/tests/generation/test_tf_logits_process.py index 195188f10bfc64..a1f665c9a761fc 100644 --- a/tests/generation/test_tf_logits_process.py +++ b/tests/generation/test_tf_logits_process.py @@ -17,8 +17,8 @@ import unittest import numpy as np - from parameterized import parameterized + from transformers import is_tf_available from transformers.testing_utils import require_tf diff --git a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py index 7d500571896373..afd4cfef4f7fe3 100644 --- a/tests/generation/test_tf_utils.py +++ b/tests/generation/test_tf_utils.py @@ -36,7 +36,6 @@ @require_tf class UtilsFunctionsTest(unittest.TestCase): - # tests whether the top_k_top_p_filtering function behaves as expected def test_top_k_top_p_filtering(self): logits = tf.convert_to_tensor( @@ -133,7 +132,6 @@ def test_top_k_top_p_filtering(self): @require_tf class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports if is_tf_available(): framework_dependent_parameters = { diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 5bbefef8f1ffc6..d65a34947b0a9d 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1413,7 +1413,6 @@ def test_constrained_beam_search_generate_dict_output(self): def test_contrastive_generate(self): # check `generate()` and `contrastive_search()` are equal for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): return @@ -1435,7 +1434,6 @@ def test_contrastive_generate(self): def test_contrastive_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: - # won't fix: FSMT and Reformer have a different cache variable type (and format). 
if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]): return @@ -1661,7 +1659,6 @@ def _check_sequence_inside_sequence(self, tensor_1, tensor_2): @require_torch class UtilsFunctionsTest(unittest.TestCase): - # tests whether the top_k_top_p function behaves as expected def test_top_k_top_p_filtering(self): logits = torch.tensor( @@ -1792,7 +1789,6 @@ def test_top_k_top_p_filtering_with_filter_value(self): @require_torch class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin): - # setting framework_dependent_parameters needs to be gated, just like its contents' imports if is_torch_available(): framework_dependent_parameters = { diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 9acb5ba99791ac..9cd62918db4e78 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class AlbertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( AlbertModel, diff --git a/tests/models/albert/test_modeling_flax_albert.py b/tests/models/albert/test_modeling_flax_albert.py index 802952e52cbd85..5292665f55b794 100644 --- a/tests/models/albert/test_modeling_flax_albert.py +++ b/tests/models/albert/test_modeling_flax_albert.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.albert.modeling_flax_albert import ( FlaxAlbertForMaskedLM, FlaxAlbertForMultipleChoice, @@ -117,7 +118,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxAlbertModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxAlbertModel, diff --git a/tests/models/albert/test_modeling_tf_albert.py b/tests/models/albert/test_modeling_tf_albert.py index ad10228a518225..873116c0d304fd 100644 --- a/tests/models/albert/test_modeling_tf_albert.py +++ b/tests/models/albert/test_modeling_tf_albert.py @@ -228,7 +228,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFAlbertModel, diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py index 5459917775d992..c25cfaec77b4aa 100644 --- a/tests/models/albert/test_tokenization_albert.py +++ b/tests/models/albert/test_tokenization_albert.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index c22b9a468409d6..ff39329d8404d6 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import AltCLIPConfig, AltCLIPProcessor, AltCLIPTextConfig, AltCLIPVisionConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -292,7 +292,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class AltCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPTextModel,) if is_torch_available() else () fx_compatible = True 
test_pruning = False @@ -343,7 +342,6 @@ def test_model_from_pretrained(self): class AltCLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -395,7 +393,6 @@ def prepare_img(): @require_torch class AltCLIPModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (AltCLIPModel,) if is_torch_available() else () fx_compatible = True test_head_masking = False diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index cf6bb1d27f7920..6fd035af8d0471 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -102,7 +102,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = ASTFeatureExtractor def setUp(self): diff --git a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py index 90d748ebea4a85..b0d3140de52d35 100644 --- a/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_modeling_audio_spectrogram_transformer.py @@ -18,6 +18,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import ASTConfig from transformers.testing_utils import require_torch, require_torchaudio, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_torchaudio_available @@ -225,7 +226,6 @@ def default_feature_extractor(self): @slow def test_inference_audio_classification(self): - feature_extractor = self.default_feature_extractor model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593").to(torch_device) diff --git a/tests/models/auto/test_modeling_auto.py b/tests/models/auto/test_modeling_auto.py index 0008aa101b45a2..9fb982c0f0e14e 100644 --- a/tests/models/auto/test_modeling_auto.py +++ b/tests/models/auto/test_modeling_auto.py @@ -42,8 +42,8 @@ if is_torch_available(): import torch - from test_module.custom_modeling import CustomModel + from transformers import ( AutoConfig, AutoModel, @@ -394,7 +394,6 @@ def test_cached_model_has_minimum_calls_to_head(self): self.assertEqual(counter.other_request_count, 0) def test_attr_not_existing(self): - from transformers.models.auto.auto_factory import _LazyAutoMapping _CONFIG_MAPPING_NAMES = OrderedDict([("bert", "BertConfig")]) diff --git a/tests/models/auto/test_modeling_flax_auto.py b/tests/models/auto/test_modeling_flax_auto.py index 26f80f97064788..5880551f54dac8 100644 --- a/tests/models/auto/test_modeling_flax_auto.py +++ b/tests/models/auto/test_modeling_flax_auto.py @@ -20,6 +20,7 @@ if is_flax_available(): import jax + from transformers.models.auto.modeling_flax_auto import FlaxAutoModel from transformers.models.bert.modeling_flax_bert import FlaxBertModel from transformers.models.roberta.modeling_flax_roberta import FlaxRobertaModel diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py index 107e3611db3fbb..a880bc0a084a5d 100644 --- 
a/tests/models/auto/test_processor_auto.py +++ b/tests/models/auto/test_processor_auto.py @@ -23,6 +23,7 @@ from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, diff --git a/tests/models/bart/test_modeling_flax_bart.py b/tests/models/bart/test_modeling_flax_bart.py index 7a1d1c5e8b4607..f97f49149817f0 100644 --- a/tests/models/bart/test_modeling_flax_bart.py +++ b/tests/models/bart/test_modeling_flax_bart.py @@ -33,6 +33,7 @@ import jax import jax.numpy as jnp + from transformers.models.bart.modeling_flax_bart import ( FlaxBartForConditionalGeneration, FlaxBartForQuestionAnswering, diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py index 24ea6e1e5cd95b..5607d1d3d2e113 100644 --- a/tests/models/bart/test_tokenization_bart.py +++ b/tests/models/bart/test_tokenization_bart.py @@ -132,7 +132,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py index 38acf046b4f3e9..fa128f5091b9f4 100644 --- a/tests/models/barthez/test_tokenization_barthez.py +++ b/tests/models/barthez/test_tokenization_barthez.py @@ -25,7 +25,6 @@ @require_sentencepiece @slow # see https://github.com/huggingface/transformers/issues/11457 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BarthezTokenizer rust_tokenizer_class = BarthezTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py index fc5ebfd19c4a14..1fc06e38e04507 100644 --- a/tests/models/bartpho/test_tokenization_bartpho.py +++ b/tests/models/bartpho/test_tokenization_bartpho.py @@ -26,7 +26,6 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BartphoTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/beit/test_image_processing_beit.py b/tests/models/beit/test_image_processing_beit.py index b9b809108f3b20..dca26e049298ff 100644 --- a/tests/models/beit/test_image_processing_beit.py +++ b/tests/models/beit/test_image_processing_beit.py @@ -105,7 +105,6 @@ def prepare_semantic_batch_inputs(): @require_torch @require_vision class BeitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BeitImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/beit/test_modeling_flax_beit.py b/tests/models/beit/test_modeling_flax_beit.py index 94ffda61eb97f2..75c58aec471731 100644 --- a/tests/models/beit/test_modeling_flax_beit.py +++ b/tests/models/beit/test_modeling_flax_beit.py @@ -27,6 +27,7 @@ if is_flax_available(): import jax + from transformers import FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling, FlaxBeitModel if is_vision_available(): @@ -140,7 +141,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxBeitModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( (FlaxBeitModel, FlaxBeitForImageClassification, FlaxBeitForMaskedImageModeling) if is_flax_available() else () ) diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py 
index 367e5ee53c4026..8873ccb613665b 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -427,7 +427,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( BertModel, @@ -565,7 +564,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. if model_class == BertForMultipleChoice: return diff --git a/tests/models/bert/test_modeling_flax_bert.py b/tests/models/bert/test_modeling_flax_bert.py index 5516c4d6fe67fe..55ffb440196f17 100644 --- a/tests/models/bert/test_modeling_flax_bert.py +++ b/tests/models/bert/test_modeling_flax_bert.py @@ -133,7 +133,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/bert/test_modeling_tf_bert.py b/tests/models/bert/test_modeling_tf_bert.py index 451f54325d13e0..86c42c3233be95 100644 --- a/tests/models/bert/test_modeling_tf_bert.py +++ b/tests/models/bert/test_modeling_tf_bert.py @@ -591,7 +591,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFBertModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFBertModel, diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py index dfbcd266c49917..c5ebd6dbf34a29 100644 --- a/tests/models/bert/test_tokenization_bert.py +++ b/tests/models/bert/test_tokenization_bert.py @@ -34,7 +34,6 @@ @require_tokenizers class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertTokenizer rust_tokenizer_class = BertTokenizerFast test_rust_tokenizer = True @@ -305,7 +304,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) diff --git a/tests/models/bert_generation/test_modeling_bert_generation.py b/tests/models/bert_generation/test_modeling_bert_generation.py index ebd8af2bb6f319..b2a6b5060bfd34 100644 --- a/tests/models/bert_generation/test_modeling_bert_generation.py +++ b/tests/models/bert_generation/test_modeling_bert_generation.py @@ -241,7 +241,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BertGenerationEncoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BertGenerationEncoder, BertGenerationDecoder) if is_torch_available() else () all_generative_model_classes = (BertGenerationDecoder,) if is_torch_available() else () diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py index 581f249db050cb..12be95d53ebd78 100644 --- a/tests/models/bert_generation/test_tokenization_bert_generation.py +++ b/tests/models/bert_generation/test_tokenization_bert_generation.py @@ -29,7 +29,6 @@ 
@require_sentencepiece class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertGenerationTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py index 038a334cebc79d..3e840018bdc15b 100644 --- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py +++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py @@ -36,7 +36,6 @@ @custom_tokenizers class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False space_between_special_tokens = True @@ -369,7 +368,6 @@ def test_sequence_builders(self): @custom_tokenizers class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertJapaneseTokenizer test_rust_tokenizer = False diff --git a/tests/models/bertweet/test_tokenization_bertweet.py b/tests/models/bertweet/test_tokenization_bertweet.py index 5f82fba516754b..2a4c643269c6da 100644 --- a/tests/models/bertweet/test_tokenization_bertweet.py +++ b/tests/models/bertweet/test_tokenization_bertweet.py @@ -22,7 +22,6 @@ class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BertweetTokenizer test_rust_tokenizer = False diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index ec8705607d6578..8859db16ecc913 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -430,7 +430,6 @@ def create_and_check_for_change_to_full_attn( @require_torch class BigBirdModelTest(ModelTesterMixin, unittest.TestCase): - # head masking & pruning is currently not supported for big bird test_head_masking = False test_pruning = False diff --git a/tests/models/big_bird/test_modeling_flax_big_bird.py b/tests/models/big_bird/test_modeling_flax_big_bird.py index 7c4c7267216a62..997df338963927 100644 --- a/tests/models/big_bird/test_modeling_flax_big_bird.py +++ b/tests/models/big_bird/test_modeling_flax_big_bird.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax + from transformers.models.big_bird.modeling_flax_big_bird import ( FlaxBigBirdForCausalLM, FlaxBigBirdForMaskedLM, @@ -134,7 +135,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxBigBirdModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxBigBirdForCausalLM, @@ -210,7 +210,6 @@ def model_jitted(input_ids, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) # overwrite from common in order to skip the check on `attentions` diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py index ff654510082574..fd4323cb0f57c6 100644 --- a/tests/models/big_bird/test_tokenization_big_bird.py +++ b/tests/models/big_bird/test_tokenization_big_bird.py @@ -30,7 +30,6 @@ @require_sentencepiece @require_tokenizers class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BigBirdTokenizer rust_tokenizer_class = BigBirdTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index b0eb86d43fc613..14d4900a5096ee 100644 
--- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -263,7 +263,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BioGptModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (BioGptModel, BioGptForCausalLM) if is_torch_available() else () all_generative_model_classes = (BioGptForCausalLM,) if is_torch_available() else () test_pruning = False diff --git a/tests/models/blenderbot/test_modeling_flax_blenderbot.py b/tests/models/blenderbot/test_modeling_flax_blenderbot.py index 70dd9c24e95cbf..771a388d4a19a0 100644 --- a/tests/models/blenderbot/test_modeling_flax_blenderbot.py +++ b/tests/models/blenderbot/test_modeling_flax_blenderbot.py @@ -34,6 +34,7 @@ import jax import jax.numpy as jnp + from transformers import BlenderbotTokenizer from transformers.models.blenderbot.modeling_flax_blenderbot import ( FlaxBlenderbotForConditionalGeneration, diff --git a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py index c0d58c0d1483ff..5252b9cb986ee7 100644 --- a/tests/models/blenderbot_small/test_modeling_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_blenderbot_small.py @@ -297,7 +297,6 @@ def tokenizer(self): @slow def test_90_generation_from_long_input(self): - src_text = [ "Social anxiety\nWow, I am never shy. Do you have anxiety?\nYes. I end up sweating and blushing and feel" " like i'm going to throw up.\nand why is that?" diff --git a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py index 695eb3b30dad12..d417ac3073d50b 100644 --- a/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_modeling_flax_blenderbot_small.py @@ -34,6 +34,7 @@ import jax import jax.numpy as jnp + from transformers.models.blenderbot_small.modeling_flax_blenderbot_small import ( FlaxBlenderbotSmallForConditionalGeneration, FlaxBlenderbotSmallModel, diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py index 7ea7f09b5764bf..b022e77682bdfb 100644 --- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py +++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py @@ -27,7 +27,6 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = BlenderbotSmallTokenizer test_rust_tokenizer = False diff --git a/tests/models/blip/test_image_processing_blip.py b/tests/models/blip/test_image_processing_blip.py index f3f2ab21dfd4a8..245c722ed4031c 100644 --- a/tests/models/blip/test_image_processing_blip.py +++ b/tests/models/blip/test_image_processing_blip.py @@ -110,7 +110,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class BlipImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BlipImageProcessor if is_vision_available() else None def setUp(self): @@ -232,7 +231,6 @@ def test_call_pytorch(self): @require_torch @require_vision class BlipImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = BlipImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/blip/test_modeling_blip.py 
b/tests/models/blip/test_modeling_blip.py index 800bd67989600d..c1c208d36bd7d6 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -301,7 +301,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -345,7 +344,6 @@ def test_model_from_pretrained(self): class BlipModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -523,7 +521,6 @@ def test_model_from_pretrained(self): class BlipTextRetrievalModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -571,7 +568,6 @@ def prepare_config_and_inputs_for_common(self): class BlipTextImageModelsModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 2e5e37ce2e96b3..6c7df61c5ffb80 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -125,7 +125,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BlipTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (BlipTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index ee703b035ac60c..85e6312e72117b 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -320,7 +320,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class BloomModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( BloomModel, @@ -450,7 +449,6 @@ def test_batch_generation(self): @slow @require_torch_gpu def test_batch_generation_padd(self): - path_560m = "bigscience/bloom-560m" model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda() model = model.eval() diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py index 117240dbdae19c..88ead384e0edcb 100644 --- a/tests/models/bloom/test_tokenization_bloom.py +++ b/tests/models/bloom/test_tokenization_bloom.py @@ -25,7 +25,6 @@ @require_tokenizers class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - slow_tokenizer_class = None rust_tokenizer_class = BloomTokenizerFast tokenizer_class = BloomTokenizerFast diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py index 85057c5278bbbd..70dba0a781c048 100644 --- a/tests/models/byt5/test_tokenization_byt5.py +++ b/tests/models/byt5/test_tokenization_byt5.py @@ -36,7 +36,6 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ByT5Tokenizer 
test_rust_tokenizer = False diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py index aff186d73cb065..6acabc7bf25dd7 100644 --- a/tests/models/camembert/test_tokenization_camembert.py +++ b/tests/models/camembert/test_tokenization_camembert.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CamembertTokenizer rust_tokenizer_class = CamembertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index cf45f10a833af8..92f27047a979c8 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CanineModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( CanineModel, diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py index 6ae27082ccebfa..a52ef3d784c80c 100644 --- a/tests/models/canine/test_tokenization_canine.py +++ b/tests/models/canine/test_tokenization_canine.py @@ -28,7 +28,6 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CanineTokenizer test_rust_tokenizer = False @@ -188,7 +187,6 @@ def test_added_token_serializable(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # a special token for Canine can be defined as follows: NEW_TOKEN = 0xE006 new_token = chr(NEW_TOKEN) @@ -262,7 +260,6 @@ def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - input = "hello world" if self.space_between_special_tokens: output = "[CLS] hello world [SEP]" diff --git a/tests/models/chinese_clip/test_image_processing_chinese_clip.py b/tests/models/chinese_clip/test_image_processing_chinese_clip.py index b7b31350713a9f..90b0ffbcd6c593 100644 --- a/tests/models/chinese_clip/test_image_processing_chinese_clip.py +++ b/tests/models/chinese_clip/test_image_processing_chinese_clip.py @@ -114,7 +114,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class ChineseCLIPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None def setUp(self): @@ -247,7 +246,6 @@ def test_call_pytorch(self): @require_torch @require_vision class ChineseCLIPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ChineseCLIPImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index 97e522b3b95e6f..637984264e3780 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -20,8 +20,8 @@ import unittest import numpy as np - import requests + from transformers import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -316,7 
+316,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ChineseCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (ChineseCLIPTextModel,) if is_torch_available() else () fx_compatible = False @@ -478,7 +477,6 @@ def test_model_from_pretrained(self): class ChineseCLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/clip/test_image_processing_clip.py b/tests/models/clip/test_image_processing_clip.py index 7ffaceb54c68da..00a43a6bb43766 100644 --- a/tests/models/clip/test_image_processing_clip.py +++ b/tests/models/clip/test_image_processing_clip.py @@ -114,7 +114,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class CLIPImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = CLIPImageProcessor if is_vision_available() else None def setUp(self): @@ -247,7 +246,6 @@ def test_call_pytorch(self): @require_torch @require_vision class CLIPImageProcessingTestFourChannels(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = CLIPImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 8fa3ce21bb31e3..2e80df6c27bfd0 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + import transformers from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig from transformers.testing_utils import ( @@ -67,6 +67,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -347,7 +348,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPTextModel, CLIPTextModelWithProjection) if is_torch_available() else () fx_compatible = True test_pruning = False @@ -402,7 +402,6 @@ def test_model_with_projection_from_pretrained(self): class CLIPModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -579,7 +578,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # load PyTorch class pt_model = model_class(config).eval() # Flax models don't use the `use_cache` option and cache is not returned as a default. 
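The import reshuffling visible in the CLIP and ChineseCLIP test hunks above follows the grouping that the new ruff-based check enforces: standard library first, then third-party packages, then `transformers` as the first-party package, with one blank line between groups. The snippet below is a minimal, invented sketch of that layout; the test class, its assertions, and the checked values are illustrative only and are not part of this diff.

```python
import unittest

import numpy as np

from transformers import BertConfig
from transformers.testing_utils import require_torch


@require_torch
class ImportOrderExampleTest(unittest.TestCase):
    def test_toy_inputs(self):
        # Invented checks, only here so that every import group above is used.
        config = BertConfig()
        input_ids = np.zeros((2, 8), dtype=np.int64)
        self.assertEqual(input_ids.shape, (2, 8))
        self.assertEqual(config.hidden_size, 768)
```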
diff --git a/tests/models/clip/test_modeling_flax_clip.py b/tests/models/clip/test_modeling_flax_clip.py index b8a1030ad1b0c8..7d63fa9edf1774 100644 --- a/tests/models/clip/test_modeling_flax_clip.py +++ b/tests/models/clip/test_modeling_flax_clip.py @@ -14,6 +14,7 @@ if is_flax_available(): import jax import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, diff --git a/tests/models/clip/test_modeling_tf_clip.py b/tests/models/clip/test_modeling_tf_clip.py index 05b4c7920ebd81..88ad5be374d0e8 100644 --- a/tests/models/clip/test_modeling_tf_clip.py +++ b/tests/models/clip/test_modeling_tf_clip.py @@ -22,6 +22,7 @@ from importlib import import_module import requests + from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig from transformers.testing_utils import require_tf, require_vision, slow from transformers.utils import is_tf_available, is_vision_available @@ -396,7 +397,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFCLIPTextModel,) if is_tf_available() else () test_pruning = False test_head_masking = False diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py index e9ba304b475dd6..fc958267105c32 100644 --- a/tests/models/clip/test_tokenization_clip.py +++ b/tests/models/clip/test_tokenization_clip.py @@ -27,7 +27,6 @@ @require_tokenizers class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CLIPTokenizer rust_tokenizer_class = CLIPTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index f170f606533850..24a59e48dc3851 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + import transformers from transformers import MODEL_MAPPING, CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig from transformers.models.auto import get_values @@ -60,6 +60,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -302,7 +303,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CLIPSegTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (CLIPSegTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -346,7 +346,6 @@ def test_model_from_pretrained(self): class CLIPSegModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: @@ -560,7 +559,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # load PyTorch class pt_model = model_class(config).eval() # Flax models don't use the `use_cache` option and cache is not returned as a default. 
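A second pattern repeated throughout these test hunks is the removal of the blank line that used to separate a `class ...:` or `def ...:` line from the first statement of its body (and from the body of `with self.subTest(...)` blocks), presumably a consequence of the formatter switch described at the top of this diff. A small, invented test class written in the resulting style would look like the following; the class name and checks are hypothetical and only illustrate the layout.

```python
import unittest

from transformers import BertConfig


class StyleExampleTest(unittest.TestCase):
    # The first attribute sits directly under the class statement, no blank line.
    config_class = BertConfig

    def setUp(self):
        # Likewise, each body starts right after its signature.
        self.config = self.config_class(hidden_size=32, num_attention_heads=4)

    def test_head_dim_divides_hidden_size(self):
        self.assertEqual(self.config.hidden_size % self.config.num_attention_heads, 0)
```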
diff --git a/tests/models/codegen/test_modeling_codegen.py b/tests/models/codegen/test_modeling_codegen.py index 091a8b401d8dc9..c5818d23eac030 100644 --- a/tests/models/codegen/test_modeling_codegen.py +++ b/tests/models/codegen/test_modeling_codegen.py @@ -349,7 +349,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class CodeGenModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CodeGenModel, CodeGenForCausalLM) if is_torch_available() else () all_generative_model_classes = (CodeGenForCausalLM,) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py index c15c8236b8da14..ec7c11dcef9d80 100644 --- a/tests/models/codegen/test_tokenization_codegen.py +++ b/tests/models/codegen/test_tokenization_codegen.py @@ -28,7 +28,6 @@ @require_tokenizers class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CodeGenTokenizer rust_tokenizer_class = CodeGenTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index f93d3cabbf2f11..ba77431467b830 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class ConditionalDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/convbert/test_modeling_convbert.py b/tests/models/convbert/test_modeling_convbert.py index b8ab2c64725a25..28762ebcfb4677 100644 --- a/tests/models/convbert/test_modeling_convbert.py +++ b/tests/models/convbert/test_modeling_convbert.py @@ -246,7 +246,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( ConvBertModel, @@ -419,7 +418,6 @@ def test_attention_outputs(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # ConvBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == ConvBertForMultipleChoice: return diff --git a/tests/models/convbert/test_modeling_tf_convbert.py b/tests/models/convbert/test_modeling_tf_convbert.py index ae675b878ed145..e9d8630a598593 100644 --- a/tests/models/convbert/test_modeling_tf_convbert.py +++ b/tests/models/convbert/test_modeling_tf_convbert.py @@ -224,7 +224,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFConvBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFConvBertModel, diff --git a/tests/models/convnext/test_image_processing_convnext.py b/tests/models/convnext/test_image_processing_convnext.py index da7d28e64dbce4..ced0765c352866 100644 --- a/tests/models/convnext/test_image_processing_convnext.py +++ b/tests/models/convnext/test_image_processing_convnext.py @@ -77,7 +77,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ConvNextImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ConvNextImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/ctrl/test_modeling_ctrl.py b/tests/models/ctrl/test_modeling_ctrl.py index 0f2149ecf9be17..71d83dd3bb9e47 100644 --- a/tests/models/ctrl/test_modeling_ctrl.py +++ b/tests/models/ctrl/test_modeling_ctrl.py @@ -193,7 +193,6 @@ def create_and_check_ctrl_for_sequence_classification(self, config, input_ids, h @require_torch class CTRLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (CTRLModel, CTRLLMHeadModel, CTRLForSequenceClassification) if is_torch_available() else () all_generative_model_classes = (CTRLLMHeadModel,) if is_torch_available() else () test_pruning = True diff --git a/tests/models/ctrl/test_modeling_tf_ctrl.py b/tests/models/ctrl/test_modeling_tf_ctrl.py index d3e82e57c9f27f..5c7a92d73c49bc 100644 --- a/tests/models/ctrl/test_modeling_tf_ctrl.py +++ b/tests/models/ctrl/test_modeling_tf_ctrl.py @@ -169,7 +169,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFCTRLModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFCTRLModel, TFCTRLLMHeadModel, TFCTRLForSequenceClassification) if is_tf_available() else () all_generative_model_classes = (TFCTRLLMHeadModel,) if is_tf_available() else () test_head_masking = False diff --git a/tests/models/ctrl/test_tokenization_ctrl.py b/tests/models/ctrl/test_tokenization_ctrl.py index 0bd4d8c8065cba..02c3459f9e0461 100644 --- a/tests/models/ctrl/test_tokenization_ctrl.py +++ b/tests/models/ctrl/test_tokenization_ctrl.py @@ -23,7 +23,6 @@ class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = CTRLTokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/deberta/test_modeling_deberta.py b/tests/models/deberta/test_modeling_deberta.py index 940a82db4398a2..91f69a11b55532 100644 --- a/tests/models/deberta/test_modeling_deberta.py +++ b/tests/models/deberta/test_modeling_deberta.py @@ -214,7 +214,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DebertaModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DebertaModel, diff --git a/tests/models/deberta/test_modeling_tf_deberta.py b/tests/models/deberta/test_modeling_tf_deberta.py index c2584db30f1976..d544a12e68c269 100644 --- a/tests/models/deberta/test_modeling_tf_deberta.py +++ b/tests/models/deberta/test_modeling_tf_deberta.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class 
TFDebertaModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDebertaModel, diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py index 4aa53e13ff8d4e..81d7bd95bd8081 100644 --- a/tests/models/deberta/test_tokenization_deberta.py +++ b/tests/models/deberta/test_tokenization_deberta.py @@ -26,7 +26,6 @@ class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = DebertaTokenizer test_rust_tokenizer = True rust_tokenizer_class = DebertaTokenizerFast diff --git a/tests/models/deberta_v2/test_modeling_deberta_v2.py b/tests/models/deberta_v2/test_modeling_deberta_v2.py index 8c9bf3bbf7e5c9..ad22f4f9a9489c 100644 --- a/tests/models/deberta_v2/test_modeling_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_deberta_v2.py @@ -227,7 +227,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DebertaV2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DebertaV2Model, diff --git a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py b/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py index b2cc8896e46ee5..bd4d05e6c13958 100644 --- a/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py +++ b/tests/models/deberta_v2/test_modeling_tf_deberta_v2.py @@ -210,7 +210,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDebertaModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDebertaV2Model, diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py index f2831315e5c29c..961cd82f548c3c 100644 --- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py +++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = DebertaV2Tokenizer rust_tokenizer_class = DebertaV2TokenizerFast test_sentencepiece = True @@ -166,7 +165,6 @@ def test_do_lower_case_false_split_by_punct_false(self): self.assertListEqual(rust_tokens, tokens_target) def test_rust_and_python_full_tokenizers(self): - tokenizer = self.get_tokenizer() rust_tokenizer = self.get_rust_tokenizer() diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py index ece5ac333945ea..10ca3a767df72d 100644 --- a/tests/models/decision_transformer/test_modeling_decision_transformer.py +++ b/tests/models/decision_transformer/test_modeling_decision_transformer.py @@ -132,7 +132,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DecisionTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (DecisionTransformerModel,) if is_torch_available() else () all_generative_model_classes = () diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py index 98ebac4bbbb433..c0d927b9c980d4 100644 --- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py +++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DeformableDetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = 
DeformableDetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/deit/test_image_processing_deit.py b/tests/models/deit/test_image_processing_deit.py index d2919ccc2ab94b..4103fc8fb25d5e 100644 --- a/tests/models/deit/test_image_processing_deit.py +++ b/tests/models/deit/test_image_processing_deit.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DeiTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DeiTImageProcessor if is_vision_available() else None test_cast_dtype = True diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index 19858cb5b7f95d..612c6d0352b10b 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -327,7 +327,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py index c1d26b2fdf54fb..e6e98476657746 100644 --- a/tests/models/deta/test_image_processing_deta.py +++ b/tests/models/deta/test_image_processing_deta.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DetaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DetaImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py index 1638cb6794ffc3..d6354de43dbe29 100644 --- a/tests/models/detr/test_image_processing_detr.py +++ b/tests/models/detr/test_image_processing_detr.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class DetrImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DetrImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py index 2f87f60697b2af..00352026ecf73f 100644 --- a/tests/models/dinat/test_modeling_dinat.py +++ b/tests/models/dinat/test_modeling_dinat.py @@ -193,7 +193,6 @@ def prepare_config_and_inputs_for_common(self): @require_natten @require_torch class DinatModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DinatModel, diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 9b4606b484ee55..50902268fd9064 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -196,7 +196,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DistilBertModel, @@ -256,7 +255,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # BertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == DistilBertForMultipleChoice: return diff --git a/tests/models/distilbert/test_modeling_flax_distilbert.py b/tests/models/distilbert/test_modeling_flax_distilbert.py index e0f609b4ddf309..f4481a6e4a077a 100644 --- a/tests/models/distilbert/test_modeling_flax_distilbert.py +++ b/tests/models/distilbert/test_modeling_flax_distilbert.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.distilbert.modeling_flax_distilbert import ( FlaxDistilBertForMaskedLM, FlaxDistilBertForMultipleChoice, @@ -111,7 +112,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxDistilBertModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaxDistilBertModel, diff --git a/tests/models/distilbert/test_modeling_tf_distilbert.py b/tests/models/distilbert/test_modeling_tf_distilbert.py index e52532d5618aae..a59401dc506b1c 100644 --- a/tests/models/distilbert/test_modeling_tf_distilbert.py +++ b/tests/models/distilbert/test_modeling_tf_distilbert.py @@ -170,7 +170,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDistilBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFDistilBertModel, diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py index 7b2c97d78a0fb6..09422395720988 100644 --- a/tests/models/distilbert/test_tokenization_distilbert.py +++ b/tests/models/distilbert/test_tokenization_distilbert.py @@ -22,7 +22,6 @@ @require_tokenizers class DistilBertTokenizationTest(BertTokenizationTest): - tokenizer_class = DistilBertTokenizer rust_tokenizer_class = DistilBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/donut/test_image_processing_donut.py b/tests/models/donut/test_image_processing_donut.py index 5ac4d1fc3d1b7f..bd992626494aa6 100644 --- a/tests/models/donut/test_image_processing_donut.py +++ b/tests/models/donut/test_image_processing_donut.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DonutImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DonutImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py index a35a65505981f1..6a0809587c8a99 100644 --- a/tests/models/donut/test_modeling_donut_swin.py +++ b/tests/models/donut/test_modeling_donut_swin.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DonutSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (DonutSwinModel,) if is_torch_available() else () fx_compatible = True diff --git a/tests/models/dpr/test_modeling_dpr.py b/tests/models/dpr/test_modeling_dpr.py index 708f1d53c3a46e..482caa054707c7 100644 --- a/tests/models/dpr/test_modeling_dpr.py +++ b/tests/models/dpr/test_modeling_dpr.py @@ -179,7 +179,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class DPRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( DPRContextEncoder, diff --git a/tests/models/dpr/test_modeling_tf_dpr.py b/tests/models/dpr/test_modeling_tf_dpr.py index 86ef3837f1fa01..4910252278fe5c 100644 --- a/tests/models/dpr/test_modeling_tf_dpr.py +++ b/tests/models/dpr/test_modeling_tf_dpr.py @@ -172,7 +172,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFDPRModelTest(TFModelTesterMixin, 
unittest.TestCase): - all_model_classes = ( ( TFDPRContextEncoder, diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py index 8ad2fea09c8bd7..db41052d4cd0e2 100644 --- a/tests/models/dpr/test_tokenization_dpr.py +++ b/tests/models/dpr/test_tokenization_dpr.py @@ -30,7 +30,6 @@ @require_tokenizers class DPRContextEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRContextEncoderTokenizer rust_tokenizer_class = DPRContextEncoderTokenizerFast test_rust_tokenizer = True @@ -38,7 +37,6 @@ class DPRContextEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRQuestionEncoderTokenizer rust_tokenizer_class = DPRQuestionEncoderTokenizerFast test_rust_tokenizer = True @@ -46,7 +44,6 @@ class DPRQuestionEncoderTokenizationTest(BertTokenizationTest): @require_tokenizers class DPRReaderTokenizationTest(BertTokenizationTest): - tokenizer_class = DPRReaderTokenizer rust_tokenizer_class = DPRReaderTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/dpt/test_image_processing_dpt.py b/tests/models/dpt/test_image_processing_dpt.py index 4ed6faadb6a8e0..4cde4cbe73f03a 100644 --- a/tests/models/dpt/test_image_processing_dpt.py +++ b/tests/models/dpt/test_image_processing_dpt.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class DPTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = DPTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/efficientformer/test_image_processing_efficientformer.py b/tests/models/efficientformer/test_image_processing_efficientformer.py index 6a17783f61d108..ff2fcafd4b7b60 100644 --- a/tests/models/efficientformer/test_image_processing_efficientformer.py +++ b/tests/models/efficientformer/test_image_processing_efficientformer.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class EfficientFormerImageProcessorTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py index 4426b37e9bbf9c..4627c545ea7f2a 100644 --- a/tests/models/efficientformer/test_modeling_efficientformer.py +++ b/tests/models/efficientformer/test_modeling_efficientformer.py @@ -325,7 +325,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index bdc0715f745813..7311df1e78ea3e 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -375,7 +375,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ElectraModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( ElectraModel, diff --git a/tests/models/electra/test_modeling_flax_electra.py b/tests/models/electra/test_modeling_flax_electra.py index cd1a795a19efec..0dda4e38fddaf7 100644 --- a/tests/models/electra/test_modeling_flax_electra.py +++ b/tests/models/electra/test_modeling_flax_electra.py @@ -105,7 
+105,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxElectraModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/electra/test_modeling_tf_electra.py b/tests/models/electra/test_modeling_tf_electra.py index 0c0c4f77ab3245..a63f0087876851 100644 --- a/tests/models/electra/test_modeling_tf_electra.py +++ b/tests/models/electra/test_modeling_tf_electra.py @@ -488,7 +488,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFElectraModel, diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 8f565aec061038..c476744057e89d 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -72,7 +72,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -106,7 +106,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -167,7 +167,7 @@ def check_encoder_decoder_model_from_pretrained_using_model_paths( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: @@ -210,7 +210,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -240,7 +240,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -281,7 +281,7 @@ def check_save_and_load_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -327,7 +327,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -395,7 +395,7 @@ def check_encoder_decoder_model_output_attentions( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -424,7 +424,7 @@ def 
check_encoder_decoder_model_output_attentions_from_config( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the # config file. Contrarily to most models, changing the model's config won't work -- the defaults are loaded @@ -491,7 +491,7 @@ def create_and_check_encoder_decoder_shared_weights( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): torch.manual_seed(0) encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) diff --git a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py index 9d807e9f650e70..362a5f74a1b6ad 100644 --- a/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_flax_encoder_decoder.py @@ -69,7 +69,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -102,7 +102,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -131,7 +131,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -170,7 +170,7 @@ def check_encoder_decoder_model_from_encoder_decoder_pretrained( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) # assert that model attributes match those of configs @@ -215,7 +215,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -292,7 +292,6 @@ def check_encoder_decoder_model_generate(self, input_ids, config, decoder_config self.assertEqual(generated_sequences.shape, (input_ids.shape[0],) + (decoder_config.max_length,)) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -334,7 +333,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = EncoderDecoderModel(encoder_decoder_config) @@ -346,7 +344,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = 
EncoderDecoderModel(encoder_decoder_config) @@ -390,7 +387,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") @@ -589,7 +585,6 @@ def get_from_encoderdecoder_pretrained_model(self): return FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2") def _check_configuration_tie(self, model): - module = model.module.bind(model.params) assert id(module.decoder.config) == id(model.config.decoder) diff --git a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py index d719d6f4c48697..76ebd687f77edf 100644 --- a/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_tf_encoder_decoder.py @@ -78,7 +78,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = EncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -111,7 +111,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -160,7 +160,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -190,7 +190,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -231,7 +231,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -298,7 +298,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -326,7 +326,7 @@ def check_encoder_decoder_model_output_attentions_from_config( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # Similar to `check_encoder_decoder_model_output_attentions`, but with `output_attentions` triggered from the # config file. 
Contrarily to most models, changing the model's config won't work -- the defaults are loaded @@ -470,7 +470,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -490,7 +489,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -607,7 +605,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() labels = config_inputs_dict.pop("decoder_token_labels") @@ -762,7 +759,6 @@ def prepare_config_and_inputs(self): @slow @is_pt_tf_cross_test def test_bert2bert_summarization(self): - from transformers import EncoderDecoderModel tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") @@ -863,7 +859,6 @@ def prepare_config_and_inputs(self): @slow @is_pt_tf_cross_test def test_bert2gpt2_summarization(self): - from transformers import EncoderDecoderModel tokenizer_in = AutoTokenizer.from_pretrained("bert-base-cased") @@ -1171,7 +1166,6 @@ def test_encoder_decoder_from_pretrained(self): decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: - # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. 
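The `**kwargs` to `**kwargs,` changes in the encoder-decoder test hunks above add the trailing comma that the formatter now keeps on the last line of an exploded signature (valid syntax on the Python 3 versions the library supports). The function below is a made-up stand-in, not one of the tester helpers in the diff, showing only the resulting signature shape.

```python
def run_toy_check(
    config,
    decoder_config,
    decoder_input_ids,
    decoder_attention_mask,
    **kwargs,  # trailing comma after **kwargs, as in the hunks above
):
    # Echo the inputs so the example runs on its own.
    return {
        "has_configs": config is not None and decoder_config is not None,
        "num_decoder_ids": len(decoder_input_ids),
        "num_mask_entries": len(decoder_attention_mask),
        "extra_options": dict(kwargs),
    }


print(run_toy_check({}, {}, [5, 6, 7], [1, 1, 1], return_dict=True))
```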
diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 8db290880edd91..d6700da7c9ff99 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -166,7 +166,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class EsmModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False all_model_classes = ( diff --git a/tests/models/esm/test_modeling_esmfold.py b/tests/models/esm/test_modeling_esmfold.py index ed307beef1ee97..a0d4232d6d8f59 100644 --- a/tests/models/esm/test_modeling_esmfold.py +++ b/tests/models/esm/test_modeling_esmfold.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class EsmFoldModelTest(ModelTesterMixin, unittest.TestCase): - test_mismatched_shapes = False all_model_classes = (EsmForProteinFolding,) if is_torch_available() else () diff --git a/tests/models/esm/test_modeling_tf_esm.py b/tests/models/esm/test_modeling_tf_esm.py index c6db0fe99f642f..0e81efd458cbfc 100644 --- a/tests/models/esm/test_modeling_tf_esm.py +++ b/tests/models/esm/test_modeling_tf_esm.py @@ -195,7 +195,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFEsmModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFEsmModel, diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py index 2cd204ebc3ddf0..b792f14e357a25 100644 --- a/tests/models/flaubert/test_modeling_flaubert.py +++ b/tests/models/flaubert/test_modeling_flaubert.py @@ -363,7 +363,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlaubertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FlaubertModel, @@ -439,7 +438,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # FlauBertForMultipleChoice behaves incorrectly in JIT environments. 
if model_class == FlaubertForMultipleChoice: return diff --git a/tests/models/flaubert/test_modeling_tf_flaubert.py b/tests/models/flaubert/test_modeling_tf_flaubert.py index 09ba6f45d8d0b9..584f904ce71ddf 100644 --- a/tests/models/flaubert/test_modeling_tf_flaubert.py +++ b/tests/models/flaubert/test_modeling_tf_flaubert.py @@ -275,7 +275,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFFlaubertModel, diff --git a/tests/models/flava/test_image_processing_flava.py b/tests/models/flava/test_image_processing_flava.py index 129343b998c83f..f9751725697e0a 100644 --- a/tests/models/flava/test_image_processing_flava.py +++ b/tests/models/flava/test_image_processing_flava.py @@ -160,7 +160,6 @@ def get_expected_codebook_image_size(self): @require_torch @require_vision class FlavaImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = FlavaImageProcessor if is_vision_available() else None maxDiff = None diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 44aff1025f2e61..7af98e21fd790f 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -22,8 +22,8 @@ import unittest import numpy as np - import requests + from transformers import ( FlavaConfig, FlavaImageCodebookConfig, @@ -435,7 +435,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaTextModel,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -569,7 +568,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaMultimodalModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaMultimodalModel,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -667,7 +665,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FlavaImageCodebookTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (FlavaImageCodebook,) if is_torch_available() else () test_pruning = False test_head_masking = False @@ -756,7 +753,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, ): - if text_kwargs is None: text_kwargs = {} if image_kwargs is None: diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 5d975b061f75f3..03c1edd4ca1a3e 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -266,7 +266,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( FNetModel, diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py index 0058155bdb6d3e..17fe3e0dd30820 100644 --- a/tests/models/fnet/test_tokenization_fnet.py +++ b/tests/models/fnet/test_tokenization_fnet.py @@ -28,7 +28,6 @@ @require_sentencepiece @require_tokenizers class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = FNetTokenizer rust_tokenizer_class = FNetTokenizerFast test_rust_tokenizer = True @@ -145,7 +144,6 @@ def test_sequence_builders(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = 
[AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/models/fsmt/test_modeling_fsmt.py b/tests/models/fsmt/test_modeling_fsmt.py index 7710152634ea46..e5526209bb22a9 100644 --- a/tests/models/fsmt/test_modeling_fsmt.py +++ b/tests/models/fsmt/test_modeling_fsmt.py @@ -17,8 +17,8 @@ import unittest import timeout_decorator # noqa - from parameterized import parameterized + from transformers import FSMTConfig, is_torch_available from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.utils import cached_property @@ -528,7 +528,6 @@ def test_odd_embed_dim(self): @unittest.skip("different from marian (needs more research)") def test_positional_emb_weights_against_marian(self): - desired_weights = torch.tensor( [ [0, 0, 0, 0, 0], diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py index e46928a538fdf9..6c5eb87db17ce5 100644 --- a/tests/models/funnel/test_tokenization_funnel.py +++ b/tests/models/funnel/test_tokenization_funnel.py @@ -26,7 +26,6 @@ @require_tokenizers class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = FunnelTokenizer rust_tokenizer_class = FunnelTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 40e435056b1955..b3b34f664d3c7a 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -17,6 +17,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import GitConfig, GitProcessor, GitVisionConfig, is_torch_available, is_vision_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -360,7 +361,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GitModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/glpn/test_image_processing_glpn.py b/tests/models/glpn/test_image_processing_glpn.py index cddc80d9ae2b22..dddc2807bc046d 100644 --- a/tests/models/glpn/test_image_processing_glpn.py +++ b/tests/models/glpn/test_image_processing_glpn.py @@ -67,7 +67,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class GLPNImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = GLPNImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py index 7d34a7f4f30dbd..233e8ce7313bef 100644 --- a/tests/models/glpn/test_modeling_glpn.py +++ b/tests/models/glpn/test_modeling_glpn.py @@ -144,7 +144,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class GLPNModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (GLPNModel, GLPNForDepthEstimation) if is_torch_available() else () test_head_masking = False diff --git a/tests/models/gpt2/test_modeling_flax_gpt2.py b/tests/models/gpt2/test_modeling_flax_gpt2.py index 23ff3f11b0aee5..e842bbc73268d9 100644 --- a/tests/models/gpt2/test_modeling_flax_gpt2.py +++ b/tests/models/gpt2/test_modeling_flax_gpt2.py @@ -29,6 +29,7 @@ if is_flax_available(): 
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -189,7 +190,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPT2ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPT2Model, FlaxGPT2LMHeadModel) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPT2LMHeadModel,) if is_flax_available() else ()
diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py
index 2f6f8d12143d57..c9a4cd771dc621 100644
--- a/tests/models/gpt2/test_modeling_gpt2.py
+++ b/tests/models/gpt2/test_modeling_gpt2.py
@@ -430,7 +430,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPT2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, GPT2ForTokenClassification)
         if is_torch_available()
diff --git a/tests/models/gpt2/test_modeling_tf_gpt2.py b/tests/models/gpt2/test_modeling_tf_gpt2.py
index 64cbea4de97797..298d501e586644 100644
--- a/tests/models/gpt2/test_modeling_tf_gpt2.py
+++ b/tests/models/gpt2/test_modeling_tf_gpt2.py
@@ -355,7 +355,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGPT2ModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (TFGPT2Model, TFGPT2LMHeadModel, TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel)
         if is_tf_available()
@@ -439,7 +438,6 @@ def test_onnx_runtime_optimize(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         for model_class in self.all_model_classes:
-
             # Skip these 2 classes which uses `tf.gather` with `batch_dims=1`
             if model_class in [TFGPT2ForSequenceClassification, TFGPT2DoubleHeadsModel]:
                 continue
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index 3273fbfce77378..17fbe51713c8ac 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -27,7 +27,6 @@
 @require_tokenizers
 class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
     tokenizer_class = GPT2Tokenizer
     rust_tokenizer_class = GPT2TokenizerFast
     test_rust_tokenizer = True
diff --git a/tests/models/gpt2/test_tokenization_gpt2_tf.py b/tests/models/gpt2/test_tokenization_gpt2_tf.py
index 117f959831a77d..e92c9e65dfd38d 100644
--- a/tests/models/gpt2/test_tokenization_gpt2_tf.py
+++ b/tests/models/gpt2/test_tokenization_gpt2_tf.py
@@ -28,7 +28,6 @@ def __init__(self, tokenizer):
     @tf.function(input_signature=(tf.TensorSpec((None,), tf.string, name="text"),))
     def serving(self, text):
-
         tokenized = self.tokenizer(text)
         input_ids_dense = tokenized["input_ids"].to_tensor()
diff --git a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
index 706b7c6cabaf3d..a32f35f6e747b1 100644
--- a/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_flax_gpt_neo.py
@@ -29,6 +29,7 @@
 if is_flax_available():
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -181,7 +182,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPTNeoModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPTNeoModel, FlaxGPTNeoForCausalLM) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPTNeoForCausalLM,) if is_flax_available() else ()
diff --git a/tests/models/gpt_neo/test_modeling_gpt_neo.py b/tests/models/gpt_neo/test_modeling_gpt_neo.py
index 534c29b82bd4b6..53cc45aac55119 100644
--- a/tests/models/gpt_neo/test_modeling_gpt_neo.py
+++ b/tests/models/gpt_neo/test_modeling_gpt_neo.py
@@ -372,7 +372,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPTNeoModel, GPTNeoForCausalLM, GPTNeoForSequenceClassification) if is_torch_available() else ()
     )
diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py
index 0435624f6f1122..d57ce69f1f0fb1 100644
--- a/tests/models/gpt_neox/test_modeling_gpt_neox.py
+++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py
@@ -186,7 +186,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoXModelTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GPTNeoXModel, GPTNeoXForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (GPTNeoXForCausalLM,) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
index 32f118ba06066b..56a5f92a8322f4 100644
--- a/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_modeling_gpt_neox_japanese.py
@@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTNeoXModelJapaneseTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GPTNeoXJapaneseModel, GPTNeoXJapaneseForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (GPTNeoXJapaneseForCausalLM,) if is_torch_available() else ()
     test_pruning = False
diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
index 4af4da30a7b5e9..293116a24e33bb 100644
--- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
@@ -29,7 +29,6 @@
 @require_tokenizers
 class GPTNeoXJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-
     tokenizer_class = GPTNeoXJapaneseTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"do_clean_text": False, "add_prefix_space": False}
diff --git a/tests/models/gptj/test_modeling_flax_gptj.py b/tests/models/gptj/test_modeling_flax_gptj.py
index 9a6472bc92ee3f..d177e345e88eca 100644
--- a/tests/models/gptj/test_modeling_flax_gptj.py
+++ b/tests/models/gptj/test_modeling_flax_gptj.py
@@ -29,6 +29,7 @@
 if is_flax_available():
     import jax
     import jax.numpy as jnp
+
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
@@ -178,7 +179,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 @require_flax
 class FlaxGPTJModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (FlaxGPTJModel, FlaxGPTJForCausalLM) if is_flax_available() else ()
     all_generative_model_classes = (FlaxGPTJForCausalLM,) if is_flax_available() else ()
diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py
index bb20c8cee6fa73..3cdb5a4b6fb916 100644
--- a/tests/models/gptj/test_modeling_gptj.py
+++ b/tests/models/gptj/test_modeling_gptj.py
@@ -361,7 +361,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (GPTJModel, GPTJForCausalLM, GPTJForSequenceClassification, GPTJForQuestionAnswering)
         if is_torch_available()
diff --git a/tests/models/gptj/test_modeling_tf_gptj.py b/tests/models/gptj/test_modeling_tf_gptj.py
index ec6c15d3f6d640..6557428c07df7c 100644
--- a/tests/models/gptj/test_modeling_tf_gptj.py
+++ b/tests/models/gptj/test_modeling_tf_gptj.py
@@ -287,7 +287,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGPTJModelTest(TFModelTesterMixin, TFCoreModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (
         (TFGPTJForCausalLM, TFGPTJForSequenceClassification, TFGPTJForQuestionAnswering, TFGPTJModel)
         if is_tf_available()
diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py
index 3b396daa677eb2..cd17e4d9185186 100644
--- a/tests/models/groupvit/test_modeling_groupvit.py
+++ b/tests/models/groupvit/test_modeling_groupvit.py
@@ -22,8 +22,8 @@
 import unittest
 import numpy as np
-
 import requests
+
 from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
 from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
 from transformers.utils import is_torch_available, is_vision_available
@@ -432,7 +432,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class GroupViTTextModelTest(ModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (GroupViTTextModel,) if is_torch_available() else ()
     test_pruning = False
     test_head_masking = False
@@ -475,7 +474,6 @@ def test_model_from_pretrained(self):
 class GroupViTModelTester:
     def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
-
         if text_kwargs is None:
             text_kwargs = {}
         if vision_kwargs is None:
diff --git a/tests/models/groupvit/test_modeling_tf_groupvit.py b/tests/models/groupvit/test_modeling_tf_groupvit.py
index 45bc8b8ec3b0c8..6283ab8988d540 100644
--- a/tests/models/groupvit/test_modeling_tf_groupvit.py
+++ b/tests/models/groupvit/test_modeling_tf_groupvit.py
@@ -23,8 +23,8 @@
 from importlib import import_module
 import numpy as np
-
 import requests
+
 from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
 from transformers.testing_utils import (
     is_pt_tf_cross_test,
@@ -96,7 +96,6 @@ def __init__(
         self.seq_length = num_patches
     def prepare_config_and_inputs(self):
-
         rng = random.Random(0)
         pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size], rng=rng)
         config = self.get_config()
@@ -452,7 +451,6 @@ def prepare_config_and_inputs_for_common(self):
 @require_tf
 class TFGroupViTTextModelTest(TFModelTesterMixin, unittest.TestCase):
-
     all_model_classes = (TFGroupViTTextModel,) if is_tf_available() else ()
     test_pruning = False
     test_head_masking = False
diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py
index 3e8d3ac6ea2993..1afea16bdd28c2 100644
--- a/tests/models/herbert/test_tokenization_herbert.py
+++ b/tests/models/herbert/test_tokenization_herbert.py
@@ -27,7 +27,6 @@
@require_tokenizers class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = HerbertTokenizer rust_tokenizer_class = HerbertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py index d37679831d0f8e..084a4001100cd4 100644 --- a/tests/models/hubert/test_modeling_tf_hubert.py +++ b/tests/models/hubert/test_modeling_tf_hubert.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFHubertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFHubertModel, TFHubertForCTC) if is_tf_available() else () test_resize_embeddings = False test_head_masking = False diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index c8ca026688b6ae..d797d6aa285a6f 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -225,7 +225,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class IBertModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = False test_head_masking = False diff --git a/tests/models/imagegpt/test_image_processing_imagegpt.py b/tests/models/imagegpt/test_image_processing_imagegpt.py index efc456e9987735..b0a2d5ceb0a3bd 100644 --- a/tests/models/imagegpt/test_image_processing_imagegpt.py +++ b/tests/models/imagegpt/test_image_processing_imagegpt.py @@ -79,7 +79,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ImageGPTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ImageGPTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 88e1e76c450802..c60282aff64393 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -265,7 +265,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class ImageGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (ImageGPTForCausalImageModeling, ImageGPTForImageClassification, ImageGPTModel) if is_torch_available() else () ) diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 16cacab88c8614..087c3aa27a13c7 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -220,7 +220,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LayoutLMModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( LayoutLMModel, diff --git a/tests/models/layoutlm/test_modeling_tf_layoutlm.py b/tests/models/layoutlm/test_modeling_tf_layoutlm.py index 7bcf6e590b9857..71963e69c84df1 100644 --- a/tests/models/layoutlm/test_modeling_tf_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_tf_layoutlm.py @@ -207,7 +207,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFLayoutLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLayoutLMModel, diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py index 3663355ee50717..b73b2aa8e44658 100644 --- a/tests/models/layoutlm/test_tokenization_layoutlm.py +++ b/tests/models/layoutlm/test_tokenization_layoutlm.py @@ -26,7 +26,6 @@ 
@require_tokenizers class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = LayoutLMTokenizer rust_tokenizer_class = LayoutLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py index d2eae1d8df36a5..52bb80e14c98d5 100644 --- a/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_image_processing_layoutlmv2.py @@ -64,7 +64,6 @@ def prepare_image_processor_dict(self): @require_torch @require_pytesseract class LayoutLMv2ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LayoutLMv2ImageProcessor if is_pytesseract_available() else None def setUp(self): diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 3c38373163e496..a4578d35345f4d 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -258,7 +258,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch @require_detectron2 class LayoutLMv2ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = True test_mismatched_shapes = False diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index e48272c151fee7..9224fbd87ea49c 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -291,7 +291,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutLMv2Tokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -526,7 +525,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1195,7 +1193,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1283,7 +1280,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1551,7 +1547,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1787,7 +1782,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -2398,7 +2392,6 @@ def test_only_label_first_subword(self): @slow def test_layoutlmv2_integration_test(self): - tokenizer_p = 
LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased") tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased") diff --git a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py index c61d52b65a9014..8827cdeea2335f 100644 --- a/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_image_processing_layoutlmv3.py @@ -64,7 +64,6 @@ def prepare_image_processor_dict(self): @require_torch @require_pytesseract class LayoutLMv3ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LayoutLMv3ImageProcessor if is_pytesseract_available() else None def setUp(self): diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index d5c8d42d22177a..601eeb58468572 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -270,7 +270,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LayoutLMv3ModelTest(ModelTesterMixin, unittest.TestCase): - test_pruning = False test_torchscript = False test_mismatched_shapes = False diff --git a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py index f71aeb0aefb4df..39de55efadf4d7 100644 --- a/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_tf_layoutlmv3.py @@ -264,7 +264,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFLayoutLMv3ModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLayoutLMv3Model, diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 322ed6861ff693..884f87680353ac 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -171,7 +171,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutLMv3Tokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -406,7 +405,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1075,7 +1073,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1161,7 +1158,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1429,7 +1425,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1665,7 +1660,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = 
self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -1756,7 +1750,6 @@ def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, return words, boxes, output_ids def test_added_token_with_space_before(self): - tokenizer_s = self.get_tokenizer() tokenizer_f = self.get_rust_tokenizer() @@ -2316,7 +2309,6 @@ def test_only_label_first_subword(self): @slow def test_layoutlmv3_integration_test(self): - tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base") tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base") diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index e74dfe496c1c71..bf295c9c925e0e 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -194,7 +194,6 @@ def test_add_special_tokens(self): tokenizers: List[LayoutXLMTokenizer] = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - special_token = "[SPECIAL_TOKEN]" special_token_box = [1000, 1000, 1000, 1000] @@ -425,7 +424,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1098,7 +1096,6 @@ def test_token_type_ids(self): tokenizers = self.get_tokenizers() for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - # test 1: single sequence words, boxes = self.get_words_and_boxes() @@ -1185,7 +1182,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -1448,7 +1444,6 @@ def test_sequence_ids(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -1684,7 +1679,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -1853,7 +1847,6 @@ def test_only_label_first_subword(self): @slow def test_layoutxlm_integration_test(self): - tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base") diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py index 2c761ad17a9a9c..7ff81749946aa4 100644 --- a/tests/models/led/test_tokenization_led.py +++ b/tests/models/led/test_tokenization_led.py @@ -128,7 +128,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git 
a/tests/models/levit/test_image_processing_levit.py b/tests/models/levit/test_image_processing_levit.py index 8fba9a5d03f8e9..12d64c81771550 100644 --- a/tests/models/levit/test_image_processing_levit.py +++ b/tests/models/levit/test_image_processing_levit.py @@ -81,7 +81,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class LevitImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = LevitImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py index 2b3436f3d05a50..1bc8eb4f6a55aa 100644 --- a/tests/models/levit/test_modeling_levit.py +++ b/tests/models/levit/test_modeling_levit.py @@ -337,7 +337,6 @@ def test_training_gradient_checkpointing(self): loss.backward() def test_problem_types(self): - parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version) if parsed_torch_version_base.base_version.startswith("1.9"): self.skipTest(reason="This test fails with PyTorch 1.9.x: some CUDA issue") @@ -362,7 +361,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index a4f189fc848a58..4eaff42684a2fc 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LiltModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( LiltModel, diff --git a/tests/models/longformer/test_modeling_tf_longformer.py b/tests/models/longformer/test_modeling_tf_longformer.py index 60a8ce01f4af45..ea3ba37407a34a 100644 --- a/tests/models/longformer/test_modeling_tf_longformer.py +++ b/tests/models/longformer/test_modeling_tf_longformer.py @@ -271,7 +271,6 @@ def prepare_config_and_inputs_for_question_answering(self): @require_tf class TFLongformerModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFLongformerModel, diff --git a/tests/models/longt5/test_modeling_flax_longt5.py b/tests/models/longt5/test_modeling_flax_longt5.py index 1ad9c1c5ce5a3b..2c262bef3092cd 100644 --- a/tests/models/longt5/test_modeling_flax_longt5.py +++ b/tests/models/longt5/test_modeling_flax_longt5.py @@ -45,6 +45,7 @@ import jax.numpy as jnp from flax.core.frozen_dict import unfreeze from flax.traverse_util import flatten_dict + from transformers import FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_MAPPING, AutoTokenizer, LongT5Config from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.models.longt5.modeling_flax_longt5 import ( @@ -82,7 +83,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -236,7 +236,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxLongT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxLongT5Model, FlaxLongT5ForConditionalGeneration) if is_flax_available() else () all_generative_model_classes = (FlaxLongT5ForConditionalGeneration,) if is_flax_available() else () is_encoder_decoder = True diff --git 
a/tests/models/longt5/test_modeling_longt5.py b/tests/models/longt5/test_modeling_longt5.py index ffc67376f862ef..ee7ef89e80fe29 100644 --- a/tests/models/longt5/test_modeling_longt5.py +++ b/tests/models/longt5/test_modeling_longt5.py @@ -71,7 +71,6 @@ def __init__( decoder_layers=None, large_model_config_path="google/long-t5-local-large", ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -502,7 +501,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LongT5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (LongT5Model, LongT5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (LongT5ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False @@ -919,7 +917,6 @@ def __init__( scope=None, large_model_config_path="google/long-t5-local-large", ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index 789988d5ca353a..a1b0093cc4861e 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -586,7 +586,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class LukeModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( LukeModel, diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 1c51d02e96b714..f57a339e5c344b 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -129,7 +129,6 @@ def __init__( self.num_hidden_layers = {"vision": r_layers, "cross_encoder": x_layers, "language": l_layers} def prepare_config_and_inputs(self): - output_attentions = self.output_attentions input_ids = ids_tensor([self.batch_size, self.seq_length], vocab_size=self.vocab_size) visual_feats = torch.rand(self.batch_size, self.num_visual_features, self.visual_feat_dim, device=torch_device) @@ -412,7 +411,6 @@ def resize_lxmert_num_qa_labels( ans, output_attentions, ): - start_labels = config.num_qa_labels num_large_labels = config.num_qa_labels * 2 num_small_labels = int(config.num_qa_labels * 2) @@ -532,7 +530,6 @@ def prepare_config_and_inputs_for_common(self, return_obj_labels=False): @require_torch class LxmertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (LxmertModel, LxmertForPreTraining, LxmertForQuestionAnswering) if is_torch_available() else () fx_compatible = True @@ -741,7 +738,6 @@ def test_retain_grad_hidden_states_attentions(self): self.assertIsNotNone(attentions_vision.grad) def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): - tf_inputs_dict = {} for key, value in pt_inputs_dict.items(): # skip key that does not exist in tf diff --git a/tests/models/lxmert/test_modeling_tf_lxmert.py b/tests/models/lxmert/test_modeling_tf_lxmert.py index 73eda47eb9508e..4fcc0867b0c6d2 100644 --- a/tests/models/lxmert/test_modeling_tf_lxmert.py +++ b/tests/models/lxmert/test_modeling_tf_lxmert.py @@ -364,7 +364,6 @@ def create_and_check_lxmert_for_pretraining( @require_tf class TFLxmertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFLxmertModel, TFLxmertForPreTraining) if is_tf_available() else () test_head_masking = False test_onnx = False @@ -493,7 +492,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): pt_inputs_dict = {} for key, value in 
tf_inputs_dict.items(): - if isinstance(value, dict): pt_inputs_dict[key] = self.prepare_pt_inputs_from_tf_inputs(value) elif isinstance(value, (list, tuple)): diff --git a/tests/models/lxmert/test_tokenization_lxmert.py b/tests/models/lxmert/test_tokenization_lxmert.py index 76047b1f44bccb..e094427f76135c 100644 --- a/tests/models/lxmert/test_tokenization_lxmert.py +++ b/tests/models/lxmert/test_tokenization_lxmert.py @@ -26,7 +26,6 @@ @require_tokenizers class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = LxmertTokenizer rust_tokenizer_class = LxmertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py index f8c5f5b7badd40..626ef294125839 100644 --- a/tests/models/m2m_100/test_tokenization_m2m_100.py +++ b/tests/models/m2m_100/test_tokenization_m2m_100.py @@ -30,7 +30,7 @@ if is_sentencepiece_available(): - from transformers.models.m2m_100.tokenization_m2m_100 import save_json, VOCAB_FILES_NAMES + from transformers.models.m2m_100.tokenization_m2m_100 import VOCAB_FILES_NAMES, save_json from ...test_tokenization_common import TokenizerTesterMixin diff --git a/tests/models/marian/test_modeling_flax_marian.py b/tests/models/marian/test_modeling_flax_marian.py index 14d8dbac8f2de3..6510c0d732d318 100644 --- a/tests/models/marian/test_modeling_flax_marian.py +++ b/tests/models/marian/test_modeling_flax_marian.py @@ -35,6 +35,7 @@ import jax import jax.numpy as jnp + from transformers import MarianTokenizer from transformers.models.marian.modeling_flax_marian import FlaxMarianModel, FlaxMarianMTModel, shift_tokens_right diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index b1e4678e4ab138..3f8be1a0a75d07 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -18,6 +18,7 @@ import unittest from huggingface_hub.hf_api import list_models + from transformers import MarianConfig, is_torch_available from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device from transformers.utils import cached_property diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py index 6a079036bb6d5d..fae0edfa6896c3 100644 --- a/tests/models/marian/test_tokenization_marian.py +++ b/tests/models/marian/test_tokenization_marian.py @@ -45,7 +45,6 @@ @require_sentencepiece class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MarianTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py index 983b4fa7e6db0c..e21c8e67709446 100644 --- a/tests/models/mask2former/test_image_processing_mask2former.py +++ b/tests/models/mask2former/test_image_processing_mask2former.py @@ -18,8 +18,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -131,7 +131,6 @@ def get_fake_mask2former_outputs(self): @require_torch @require_vision class Mask2FormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = Mask2FormerImageProcessor if (is_vision_available() and 
is_torch_available()) else None def setUp(self): diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py index 3a8645f64a88f5..4d7409697e0236 100644 --- a/tests/models/mask2former/test_modeling_mask2former.py +++ b/tests/models/mask2former/test_modeling_mask2former.py @@ -173,7 +173,6 @@ def comm_check_on_output(result): @require_torch class Mask2FormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Mask2FormerModel, Mask2FormerForUniversalSegmentation) if is_torch_available() else () is_encoder_decoder = False diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index 2d7b710b59da7f..694029603bf7d8 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -18,8 +18,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -131,7 +131,6 @@ def get_fake_maskformer_outputs(self): @require_torch @require_vision class MaskFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MaskFormerImageProcessor if (is_vision_available() and is_torch_available()) else None def setUp(self): diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py index 52c811591bba80..a5f53d9f6fed10 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -173,7 +173,6 @@ def comm_check_on_output(result): @require_torch class MaskFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (MaskFormerModel, MaskFormerForInstanceSegmentation) if is_torch_available() else () is_encoder_decoder = False diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py index 3d4de9c8e69167..3b606188a1b820 100644 --- a/tests/models/maskformer/test_modeling_maskformer_swin.py +++ b/tests/models/maskformer/test_modeling_maskformer_swin.py @@ -163,7 +163,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MaskFormerSwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MaskFormerSwinModel, diff --git a/tests/models/mbart/test_modeling_flax_mbart.py b/tests/models/mbart/test_modeling_flax_mbart.py index 1be81583575fb9..a642b2344c9e1e 100644 --- a/tests/models/mbart/test_modeling_flax_mbart.py +++ b/tests/models/mbart/test_modeling_flax_mbart.py @@ -35,6 +35,7 @@ import jax import jax.numpy as jnp + from transformers import AutoTokenizer from transformers.models.mbart.modeling_flax_mbart import ( FlaxMBartForConditionalGeneration, diff --git a/tests/models/mctct/test_feature_extraction_mctct.py b/tests/models/mctct/test_feature_extraction_mctct.py index e0c77ad450fde6..29b0cf899ad38a 100644 --- a/tests/models/mctct/test_feature_extraction_mctct.py +++ b/tests/models/mctct/test_feature_extraction_mctct.py @@ -104,7 +104,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None def setUp(self): diff --git 
a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py index 4ea3ddcb7be006..2a0c2f6ead8263 100644 --- a/tests/models/megatron_bert/test_modeling_megatron_bert.py +++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py @@ -267,7 +267,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MegatronBertModel, diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 04301962c3cdad..45b0942e2b72ed 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -254,7 +254,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MobileBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MobileBertModel, diff --git a/tests/models/mobilebert/test_modeling_tf_mobilebert.py b/tests/models/mobilebert/test_modeling_tf_mobilebert.py index 75334e2945091e..40586d20019abf 100644 --- a/tests/models/mobilebert/test_modeling_tf_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_tf_mobilebert.py @@ -42,7 +42,6 @@ @require_tf class TFMobileBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFMobileBertModel, diff --git a/tests/models/mobilebert/test_tokenization_mobilebert.py b/tests/models/mobilebert/test_tokenization_mobilebert.py index 395f4a2aab2cb9..3ecc2e3238d512 100644 --- a/tests/models/mobilebert/test_tokenization_mobilebert.py +++ b/tests/models/mobilebert/test_tokenization_mobilebert.py @@ -35,7 +35,6 @@ # Copied from transformers.tests.models.bert.test_modeling_bert.py with Bert->MobileBert and pathfix @require_tokenizers class MobileBERTTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MobileBertTokenizer rust_tokenizer_class = MobileBertTokenizerFast test_rust_tokenizer = True @@ -312,7 +311,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) diff --git a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py index 34096ff1f960ab..51ca6f3b17c476 100644 --- a/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py +++ b/tests/models/mobilenet_v1/test_image_processing_mobilenet_v1.py @@ -72,7 +72,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class MobileNetV1ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV1ImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py index 472280753e9cc8..d5f148b21cd665 100644 --- a/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py +++ b/tests/models/mobilenet_v2/test_image_processing_mobilenet_v2.py @@ -72,7 +72,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class 
MobileNetV2ImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileNetV2ImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mobilevit/test_image_processing_mobilevit.py b/tests/models/mobilevit/test_image_processing_mobilevit.py index d14a405715017a..fbc72a2d9e00ce 100644 --- a/tests/models/mobilevit/test_image_processing_mobilevit.py +++ b/tests/models/mobilevit/test_image_processing_mobilevit.py @@ -75,7 +75,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class MobileViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = MobileViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py index 1e72870fdaddf1..1be05033806942 100644 --- a/tests/models/mpnet/test_modeling_mpnet.py +++ b/tests/models/mpnet/test_modeling_mpnet.py @@ -191,7 +191,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class MPNetModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( MPNetForMaskedLM, diff --git a/tests/models/mpnet/test_modeling_tf_mpnet.py b/tests/models/mpnet/test_modeling_tf_mpnet.py index a0a4964d57e95a..3688e106cc664d 100644 --- a/tests/models/mpnet/test_modeling_tf_mpnet.py +++ b/tests/models/mpnet/test_modeling_tf_mpnet.py @@ -185,7 +185,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFMPNetModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFMPNetForMaskedLM, diff --git a/tests/models/mpnet/test_tokenization_mpnet.py b/tests/models/mpnet/test_tokenization_mpnet.py index f761b0280953de..e30dd3a9145e26 100644 --- a/tests/models/mpnet/test_tokenization_mpnet.py +++ b/tests/models/mpnet/test_tokenization_mpnet.py @@ -26,7 +26,6 @@ @require_tokenizers class MPNetTokenizerTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MPNetTokenizer rust_tokenizer_class = MPNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/mt5/test_modeling_flax_mt5.py b/tests/models/mt5/test_modeling_flax_mt5.py index f9ef2d5e18478a..34a5731fd059f3 100644 --- a/tests/models/mt5/test_modeling_flax_mt5.py +++ b/tests/models/mt5/test_modeling_flax_mt5.py @@ -21,6 +21,7 @@ if is_flax_available(): import optax from flax.training.common_utils import onehot + from transformers import AutoTokenizer, FlaxMT5ForConditionalGeneration from transformers.models.t5.modeling_flax_t5 import shift_tokens_right diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py index 71e83fba0e16ff..8bddb8443b642f 100644 --- a/tests/models/mvp/test_tokenization_mvp.py +++ b/tests/models/mvp/test_tokenization_mvp.py @@ -132,7 +132,6 @@ def test_prepare_batch_not_longer_than_maxlen(self): @require_torch def test_special_tokens(self): - src_text = ["A long paragraph for summarization."] tgt_text = [ "Summary of the text.", diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py index b89d4c1bb75a8b..1c74fe84b044e7 100644 --- a/tests/models/nat/test_modeling_nat.py +++ b/tests/models/nat/test_modeling_nat.py @@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self): @require_natten @require_torch class NatModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( NatModel, diff --git a/tests/models/nezha/test_modeling_nezha.py b/tests/models/nezha/test_modeling_nezha.py 
index 6c91d8e7fb18a5..31eb9e1e9b11b9 100644 --- a/tests/models/nezha/test_modeling_nezha.py +++ b/tests/models/nezha/test_modeling_nezha.py @@ -316,7 +316,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class NezhaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( NezhaModel, @@ -429,7 +428,6 @@ def test_model_from_pretrained(self): def test_torchscript_device_change(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - # NezhaForMultipleChoice behaves incorrectly in JIT environments. if model_class == NezhaForMultipleChoice: return diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py index d77b101fa76608..01d0420180c8ac 100644 --- a/tests/models/nllb/test_tokenization_nllb.py +++ b/tests/models/nllb/test_tokenization_nllb.py @@ -258,7 +258,6 @@ def test_save_slow_from_fast_and_reload_fast(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index b93c074bf68377..43879571064996 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -217,7 +217,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class NystromformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( NystromformerModel, diff --git a/tests/models/oneformer/test_image_processing_oneformer.py b/tests/models/oneformer/test_image_processing_oneformer.py index ac12447a36341e..8dec5c407083b9 100644 --- a/tests/models/oneformer/test_image_processing_oneformer.py +++ b/tests/models/oneformer/test_image_processing_oneformer.py @@ -18,8 +18,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/oneformer/test_processor_oneformer.py b/tests/models/oneformer/test_processor_oneformer.py index 5056d682832d95..5ce677cba6c6f5 100644 --- a/tests/models/oneformer/test_processor_oneformer.py +++ b/tests/models/oneformer/test_processor_oneformer.py @@ -21,8 +21,8 @@ import numpy as np from datasets import load_dataset - from huggingface_hub import hf_hub_download + from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/openai/test_modeling_openai.py b/tests/models/openai/test_modeling_openai.py index 6c91808421f4b8..e525c297410797 100644 --- a/tests/models/openai/test_modeling_openai.py +++ b/tests/models/openai/test_modeling_openai.py @@ -190,7 +190,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OpenAIGPTModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, OpenAIGPTForSequenceClassification) if is_torch_available() diff --git a/tests/models/openai/test_modeling_tf_openai.py 
b/tests/models/openai/test_modeling_tf_openai.py index 7cdc2a8bb1879b..3ce67e058a469d 100644 --- a/tests/models/openai/test_modeling_tf_openai.py +++ b/tests/models/openai/test_modeling_tf_openai.py @@ -192,7 +192,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFOpenAIGPTModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( (TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TFOpenAIGPTDoubleHeadsModel, TFOpenAIGPTForSequenceClassification) if is_tf_available() diff --git a/tests/models/opt/test_modeling_flax_opt.py b/tests/models/opt/test_modeling_flax_opt.py index 04d1f75db1c74a..ef94633f22a87e 100644 --- a/tests/models/opt/test_modeling_flax_opt.py +++ b/tests/models/opt/test_modeling_flax_opt.py @@ -33,6 +33,7 @@ import jax import jax.numpy as jnp + from transformers import FlaxOPTForCausalLM, FlaxOPTModel, GPT2Tokenizer diff --git a/tests/models/owlvit/test_image_processing_owlvit.py b/tests/models/owlvit/test_image_processing_owlvit.py index b94120f563da1a..5a0afa382652eb 100644 --- a/tests/models/owlvit/test_image_processing_owlvit.py +++ b/tests/models/owlvit/test_image_processing_owlvit.py @@ -82,7 +82,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class OwlViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = OwlViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 957533980151ad..bd83d7e682961b 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -21,8 +21,8 @@ import unittest import numpy as np - import requests + from transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -293,7 +293,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class OwlViTTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (OwlViTTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -339,7 +338,6 @@ def test_model_from_pretrained(self): class OwlViTModelTester: def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/pegasus/test_modeling_flax_pegasus.py b/tests/models/pegasus/test_modeling_flax_pegasus.py index 61c356bfb0ced3..fbc49c78112bf9 100644 --- a/tests/models/pegasus/test_modeling_flax_pegasus.py +++ b/tests/models/pegasus/test_modeling_flax_pegasus.py @@ -30,10 +30,10 @@ # This makes JAX allocate exactly what is needed on demand, and deallocate memory that is no longer needed # but will be slower as stated here https://jax.readthedocs.io/en/latest/gpu_memory_allocation.html os.environ["XLA_PYTHON_CLIENT_ALLOCATOR"] = "platform" - import numpy as np - import jax import jax.numpy as jnp + import numpy as np + from transformers import FlaxPegasusForConditionalGeneration, FlaxPegasusModel diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py index de2886a5e12081..8f554a411e7d12 100644 --- a/tests/models/pegasus/test_tokenization_pegasus.py +++ b/tests/models/pegasus/test_tokenization_pegasus.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class 
PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True @@ -134,7 +133,6 @@ def test_tokenizer_integration(self): @require_sentencepiece @require_tokenizers class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/perceiver/test_modeling_perceiver.py b/tests/models/perceiver/test_modeling_perceiver.py index 5f69b9ff69eec0..f07b8746768ee1 100644 --- a/tests/models/perceiver/test_modeling_perceiver.py +++ b/tests/models/perceiver/test_modeling_perceiver.py @@ -263,7 +263,6 @@ def prepare_config_and_inputs_for_model_class(self, model_class): @require_torch class PerceiverModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( PerceiverModel, @@ -739,7 +738,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] @@ -849,7 +847,6 @@ def extract_image_patches(x, kernel, stride=1, dilation=1): class PerceiverModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver") model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver") model.to(torch_device) @@ -884,7 +881,6 @@ def test_inference_masked_lm(self): @slow def test_inference_image_classification(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned") model.to(torch_device) @@ -909,7 +905,6 @@ def test_inference_image_classification(self): @slow def test_inference_image_classification_fourier(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier") model.to(torch_device) @@ -934,7 +929,6 @@ def test_inference_image_classification_fourier(self): @slow def test_inference_image_classification_conv(self): - feature_extractor = PerceiverFeatureExtractor() model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv") model.to(torch_device) diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py index 3c7a67bcd2b916..197ab6d5bfa209 100644 --- a/tests/models/perceiver/test_tokenization_perceiver.py +++ b/tests/models/perceiver/test_tokenization_perceiver.py @@ -36,7 +36,6 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PerceiverTokenizer test_rust_tokenizer = False diff --git a/tests/models/phobert/test_tokenization_phobert.py b/tests/models/phobert/test_tokenization_phobert.py index de16c154c92524..6624957531b07c 100644 --- a/tests/models/phobert/test_tokenization_phobert.py +++ b/tests/models/phobert/test_tokenization_phobert.py @@ -22,7 +22,6 @@ class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = PhobertTokenizer test_rust_tokenizer = False diff --git a/tests/models/poolformer/test_image_processing_poolformer.py b/tests/models/poolformer/test_image_processing_poolformer.py index d5596a55a9f258..b6078fb5c5a424 100644 --- 
a/tests/models/poolformer/test_image_processing_poolformer.py +++ b/tests/models/poolformer/test_image_processing_poolformer.py @@ -78,7 +78,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class PoolFormerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = PoolFormerImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py index 9bb8fa2e29cd34..a7869cbf7477a2 100644 --- a/tests/models/poolformer/test_modeling_poolformer.py +++ b/tests/models/poolformer/test_modeling_poolformer.py @@ -122,7 +122,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class PoolFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (PoolFormerModel, PoolFormerForImageClassification) if is_torch_available() else () test_head_masking = False diff --git a/tests/models/prophetnet/test_modeling_prophetnet.py b/tests/models/prophetnet/test_modeling_prophetnet.py index 9258d797884b0b..86cc8f3896e376 100644 --- a/tests/models/prophetnet/test_modeling_prophetnet.py +++ b/tests/models/prophetnet/test_modeling_prophetnet.py @@ -70,7 +70,6 @@ def __init__( disable_ngram_loss=False, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/prophetnet/test_tokenization_prophetnet.py b/tests/models/prophetnet/test_tokenization_prophetnet.py index 8d95eb310025d1..cf4317b3a669b6 100644 --- a/tests/models/prophetnet/test_tokenization_prophetnet.py +++ b/tests/models/prophetnet/test_tokenization_prophetnet.py @@ -32,7 +32,6 @@ class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ProphetNetTokenizer test_rust_tokenizer = False diff --git a/tests/models/qdqbert/test_modeling_qdqbert.py b/tests/models/qdqbert/test_modeling_qdqbert.py index 82bf5e3e336457..66e864a6be8d02 100644 --- a/tests/models/qdqbert/test_modeling_qdqbert.py +++ b/tests/models/qdqbert/test_modeling_qdqbert.py @@ -420,7 +420,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch @require_pytorch_quantization class QDQBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( QDQBertModel, diff --git a/tests/models/rag/test_modeling_rag.py b/tests/models/rag/test_modeling_rag.py index 80819663a107d1..48c7099620f308 100644 --- a/tests/models/rag/test_modeling_rag.py +++ b/tests/models/rag/test_modeling_rag.py @@ -48,10 +48,10 @@ T5_SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") if is_torch_available() and is_datasets_available() and is_faiss_available(): + import faiss import torch from datasets import Dataset - import faiss from transformers import ( AutoConfig, AutoModel, @@ -99,7 +99,6 @@ def require_retrieval(test_case): @require_retrieval @require_sentencepiece class RagTestMixin: - all_model_classes = ( (RagModel, RagTokenForGeneration, RagSequenceForGeneration) if is_torch_available() and is_datasets_available() and is_faiss_available() @@ -493,7 +492,7 @@ def check_model_with_mismatch_n_docs_value( decoder_attention_mask, retriever_n_docs, generator_n_docs, - **kwargs + **kwargs, ): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) diff --git a/tests/models/rag/test_modeling_tf_rag.py b/tests/models/rag/test_modeling_tf_rag.py index 314ce099baf65f..541d7cd132714e 100644 --- a/tests/models/rag/test_modeling_tf_rag.py +++ 
b/tests/models/rag/test_modeling_tf_rag.py @@ -16,9 +16,9 @@ if is_tf_available() and is_datasets_available() and is_faiss_available(): + import faiss import tensorflow as tf from datasets import Dataset - import faiss from transformers import ( AutoConfig, @@ -31,7 +31,6 @@ TFRagSequenceForGeneration, TFRagTokenForGeneration, ) - from transformers.modeling_tf_outputs import TFBaseModelOutput from ..bart.test_modeling_tf_bart import TFBartModelTester @@ -58,7 +57,6 @@ def require_retrieval(test_case): @require_retrieval @require_sentencepiece class TFRagTestMixin: - all_model_classes = ( (TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration) if is_tf_available() and is_datasets_available() and is_faiss_available() @@ -392,7 +390,7 @@ def check_model_with_mismatch_n_docs_value( decoder_attention_mask, retriever_n_docs, generator_n_docs, - **kwargs + **kwargs, ): self.assertIsNotNone(config.question_encoder) self.assertIsNotNone(config.generator) diff --git a/tests/models/rag/test_retrieval_rag.py b/tests/models/rag/test_retrieval_rag.py index c6c1e11360f8b5..d4c119815c96f4 100644 --- a/tests/models/rag/test_retrieval_rag.py +++ b/tests/models/rag/test_retrieval_rag.py @@ -360,7 +360,6 @@ def test_hf_index_retriever_call(self): @require_tokenizers @require_sentencepiece def test_custom_hf_index_end2end_retriever_call(self): - context_encoder_tokenizer = self.get_dpr_ctx_encoder_tokenizer() n_docs = 1 retriever = self.get_dummy_custom_hf_index_retriever(from_disk=False) diff --git a/tests/models/rag/test_tokenization_rag.py b/tests/models/rag/test_tokenization_rag.py index ae9909248471be..3ac5b0efe02ea6 100644 --- a/tests/models/rag/test_tokenization_rag.py +++ b/tests/models/rag/test_tokenization_rag.py @@ -110,7 +110,6 @@ def tearDown(self): @require_tokenizers def test_save_load_pretrained_with_saved_config(self): - save_dir = os.path.join(self.tmpdirname, "rag_tokenizer") rag_config = RagConfig(question_encoder=DPRConfig().to_dict(), generator=BartConfig().to_dict()) rag_tokenizer = RagTokenizer(question_encoder=self.get_dpr_tokenizer(), generator=self.get_bart_tokenizer()) diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py index c087a4dd0d3844..3fa993285dcecc 100644 --- a/tests/models/realm/test_modeling_realm.py +++ b/tests/models/realm/test_modeling_realm.py @@ -304,7 +304,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RealmModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RealmEmbedder, diff --git a/tests/models/realm/test_tokenization_realm.py b/tests/models/realm/test_tokenization_realm.py index 2a065ceee66af6..6a5a3878fd4354 100644 --- a/tests/models/realm/test_tokenization_realm.py +++ b/tests/models/realm/test_tokenization_realm.py @@ -33,7 +33,6 @@ @require_tokenizers class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RealmTokenizer rust_tokenizer_class = RealmTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/reformer/test_tokenization_reformer.py b/tests/models/reformer/test_tokenization_reformer.py index 37ea66847f2d0c..a2a0db6c370580 100644 --- a/tests/models/reformer/test_tokenization_reformer.py +++ b/tests/models/reformer/test_tokenization_reformer.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = ReformerTokenizer rust_tokenizer_class = ReformerTokenizerFast test_rust_tokenizer = True diff --git 
a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index a3ffd6dfd5a164..b431aff86e0983 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -361,7 +361,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RemBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RemBertModel, diff --git a/tests/models/rembert/test_modeling_tf_rembert.py b/tests/models/rembert/test_modeling_tf_rembert.py index 6d4cf0a523b933..11a7b54497b98b 100644 --- a/tests/models/rembert/test_modeling_tf_rembert.py +++ b/tests/models/rembert/test_modeling_tf_rembert.py @@ -571,7 +571,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRemBertModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRemBertModel, diff --git a/tests/models/retribert/test_tokenization_retribert.py b/tests/models/retribert/test_tokenization_retribert.py index e2bf4e61b1ac09..25b3df6f3e34ea 100644 --- a/tests/models/retribert/test_tokenization_retribert.py +++ b/tests/models/retribert/test_tokenization_retribert.py @@ -35,7 +35,6 @@ # Copied from transformers.tests.bert.test_modeling_bert.py with Bert->RetriBert @require_tokenizers class RetriBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RetriBertTokenizer test_slow_tokenizer = True rust_tokenizer_class = RetriBertTokenizerFast @@ -307,7 +306,6 @@ def test_change_tokenize_chinese_chars(self): text_with_chinese_char = "".join(list_of_commun_chinese_char) for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - kwargs["tokenize_chinese_chars"] = True tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs) tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) @@ -352,7 +350,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return diff --git a/tests/models/roberta/test_modeling_flax_roberta.py b/tests/models/roberta/test_modeling_flax_roberta.py index 5bd8a56022ce8c..c325e295f610d1 100644 --- a/tests/models/roberta/test_modeling_flax_roberta.py +++ b/tests/models/roberta/test_modeling_flax_roberta.py @@ -132,7 +132,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax class FlaxRobertaModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 5128789d41a5d7..f9df989d9bb211 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -367,7 +367,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( RobertaForCausalLM, diff --git a/tests/models/roberta/test_modeling_tf_roberta.py b/tests/models/roberta/test_modeling_tf_roberta.py index f9408b84171d3c..28166171e11678 100644 --- a/tests/models/roberta/test_modeling_tf_roberta.py +++ b/tests/models/roberta/test_modeling_tf_roberta.py @@ -548,7 +548,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRobertaModelTest(TFModelTesterMixin, 
unittest.TestCase): - all_model_classes = ( ( TFRobertaModel, diff --git a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py index 357c05fb02a13f..3f15ca9ff3af1b 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_flax_roberta_prelayernorm.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.roberta_prelayernorm.modeling_flax_roberta_prelayernorm import ( FlaxRobertaPreLayerNormForCausalLM, FlaxRobertaPreLayerNormForMaskedLM, @@ -135,7 +136,6 @@ def prepare_config_and_inputs_for_decoder(self): @require_flax # Copied from tests.models.roberta.test_modelling_flax_roberta.FlaxRobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta-base->andreasmadsen/efficient_mlm_m0.40 class FlaxRobertaPreLayerNormModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 971f87c4ee634b..d32a5e959d7b9e 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -366,7 +366,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch # Copied from tests.models.roberta.test_modelling_roberta.RobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm class RobertaPreLayerNormModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( RobertaPreLayerNormForCausalLM, diff --git a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py index a7263218709e73..1580f27e6f0863 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_tf_roberta_prelayernorm.py @@ -550,7 +550,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf # Copied from tests.models.roberta.test_modelling_tf_roberta.TFRobertaPreLayerNormModelTest with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm class TFRobertaPreLayerNormModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRobertaPreLayerNormModel, diff --git a/tests/models/roformer/test_modeling_flax_roformer.py b/tests/models/roformer/test_modeling_flax_roformer.py index d45c08efdbb394..28d0ffba95a407 100644 --- a/tests/models/roformer/test_modeling_flax_roformer.py +++ b/tests/models/roformer/test_modeling_flax_roformer.py @@ -24,6 +24,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers.models.roformer.modeling_flax_roformer import ( FlaxRoFormerForMaskedLM, FlaxRoFormerForMultipleChoice, @@ -116,7 +117,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxRoFormerModelTest(FlaxModelTesterMixin, unittest.TestCase): - test_head_masking = True all_model_classes = ( diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index dadb0d8e747b6b..acd6ec885bd55e 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -361,7 +361,6 @@ def 
prepare_config_and_inputs_for_common(self): @require_torch class RoFormerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( RoFormerModel, @@ -491,7 +490,6 @@ def test_basic(self): ) def test_positional_emb_weights_against_roformer(self): - desired_weights = torch.tensor( [ [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], diff --git a/tests/models/roformer/test_modeling_tf_roformer.py b/tests/models/roformer/test_modeling_tf_roformer.py index d32d30ae8ad92c..a0fba3a6806f81 100644 --- a/tests/models/roformer/test_modeling_tf_roformer.py +++ b/tests/models/roformer/test_modeling_tf_roformer.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFRoFormerModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFRoFormerModel, @@ -344,7 +343,6 @@ def test_basic(self): tf.debugging.assert_near(emb, desired_weights, atol=self.tolerance) def test_positional_emb_weights_against_roformer(self): - desired_weights = tf.constant( [ [0.0000, 0.0000, 0.0000, 0.0000, 0.0000], diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 7546bc2e41ddd9..2d674100f02bfc 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -24,7 +24,6 @@ @require_rjieba @require_tokenizers class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = RoFormerTokenizer rust_tokenizer_class = RoFormerTokenizerFast space_between_special_tokens = True diff --git a/tests/models/segformer/test_image_processing_segformer.py b/tests/models/segformer/test_image_processing_segformer.py index 2be0228d65a5f6..5559fae83b082d 100644 --- a/tests/models/segformer/test_image_processing_segformer.py +++ b/tests/models/segformer/test_image_processing_segformer.py @@ -96,7 +96,6 @@ def prepare_semantic_batch_inputs(): @require_torch @require_vision class SegformerImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = SegformerImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py index 6037170fb1f345..57513800d6ffee 100644 --- a/tests/models/segformer/test_modeling_segformer.py +++ b/tests/models/segformer/test_modeling_segformer.py @@ -160,7 +160,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SegformerModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SegformerModel, diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py index dfdb24f37b17ee..bfcc580bb4cc2d 100644 --- a/tests/models/segformer/test_modeling_tf_segformer.py +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -288,7 +288,6 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): diff --git a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py index c42da75bf635b7..f2c75e702bf765 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py +++ 
b/tests/models/speech_encoder_decoder/test_modeling_flax_speech_encoder_decoder.py @@ -33,6 +33,7 @@ import jax.numpy as jnp from flax.training.common_utils import onehot from flax.traverse_util import flatten_dict + from transformers import ( FlaxBartForCausalLM, FlaxBertForCausalLM, @@ -73,7 +74,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -103,7 +104,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -142,7 +143,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -169,7 +170,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -208,7 +209,7 @@ def check_encoder_decoder_model_from_encoder_decoder_pretrained( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) # assert that loading encoder and decoder models from configs has been correctly executed @@ -253,7 +254,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -336,7 +337,7 @@ def check_freeze_feature_encoder( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) enc_dec_model = FlaxSpeechEncoderDecoderModel(encoder_decoder_config) @@ -406,7 +407,6 @@ def compute_loss( self.assertTrue((grad == grad_frozen).all()) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -448,7 +448,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = SpeechEncoderDecoderModel(encoder_decoder_config) @@ -460,7 +459,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = SpeechEncoderDecoderModel(encoder_decoder_config) @@ -508,7 +506,6 @@ def 
assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 3ecca17324a34e..368232331a2ac0 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -62,7 +62,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_decoder_config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -95,7 +95,7 @@ def check_encoder_decoder_model( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -135,7 +135,7 @@ def check_encoder_decoder_model_with_inputs( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): inputs = input_values if input_features is None else input_features encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) @@ -173,7 +173,7 @@ def check_encoder_decoder_model_from_pretrained( return_dict, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -202,7 +202,7 @@ def check_save_and_load( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -245,7 +245,7 @@ def check_save_and_load_encoder_decoder_model( decoder_attention_mask, input_values=None, input_features=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = SpeechEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -292,7 +292,7 @@ def check_encoder_decoder_model_output_attentions( labels=None, input_values=None, input_features=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 244b748c7139ff..749323d3a8c9c6 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -104,7 +104,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Speech2TextFeatureExtractor if is_speech_available() else None def setUp(self): diff --git 
a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index 2720f531af9179..34cf071bd11aa6 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -149,7 +149,6 @@ def prepare_inputs_for_target(self, equal_length=False, numpify=False): @require_torch @require_torchaudio class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = SpeechT5FeatureExtractor if is_speech_available() else None def setUp(self): diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index f064611b6a9e98..14dce7e9757854 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -208,7 +208,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SplinterModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SplinterModel, @@ -338,7 +337,6 @@ def test_multi_gpu_data_parallel_forward(self): inputs_dict[k] = v.to(0) for model_class in self.all_model_classes: - # Skip this case since it will fail sometimes, as described above. if model_class == SplinterForPreTraining: continue diff --git a/tests/models/squeezebert/test_modeling_squeezebert.py b/tests/models/squeezebert/test_modeling_squeezebert.py index cffc4570a05918..8b8187cbc3f3a6 100644 --- a/tests/models/squeezebert/test_modeling_squeezebert.py +++ b/tests/models/squeezebert/test_modeling_squeezebert.py @@ -215,7 +215,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SqueezeBertModel, diff --git a/tests/models/squeezebert/test_tokenization_squeezebert.py b/tests/models/squeezebert/test_tokenization_squeezebert.py index 88d715bcc1409f..a65862556405e8 100644 --- a/tests/models/squeezebert/test_tokenization_squeezebert.py +++ b/tests/models/squeezebert/test_tokenization_squeezebert.py @@ -22,7 +22,6 @@ @require_tokenizers class SqueezeBertTokenizationTest(BertTokenizationTest): - tokenizer_class = SqueezeBertTokenizer rust_tokenizer_class = SqueezeBertTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 0b780d74b5b3f2..a56ba40978e2c0 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -219,7 +219,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SwinModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( SwinModel, diff --git a/tests/models/swin/test_modeling_tf_swin.py b/tests/models/swin/test_modeling_tf_swin.py index be5861ce48b4a8..bc2e957c433f7d 100644 --- a/tests/models/swin/test_modeling_tf_swin.py +++ b/tests/models/swin/test_modeling_tf_swin.py @@ -177,7 +177,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFSwinModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFSwinModel, @@ -335,7 +334,6 @@ def check_hidden_states_output(self, inputs_dict, config, model_class, image_siz ) def test_hidden_states_output(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() image_size = to_2tuple(self.model_tester.image_size) diff --git a/tests/models/swin2sr/test_image_processing_swin2sr.py b/tests/models/swin2sr/test_image_processing_swin2sr.py index 
06e9539693e1bc..1cb19387ffb974 100644 --- a/tests/models/swin2sr/test_image_processing_swin2sr.py +++ b/tests/models/swin2sr/test_image_processing_swin2sr.py @@ -101,7 +101,6 @@ def prepare_inputs(self, equal_resolution=False, numpify=False, torchify=False): @require_torch @require_vision class Swin2SRImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = Swin2SRImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py index 5bd54d7a79f9b9..556decee8f4796 100644 --- a/tests/models/swin2sr/test_modeling_swin2sr.py +++ b/tests/models/swin2sr/test_modeling_swin2sr.py @@ -156,7 +156,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Swin2SRModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (Swin2SRModel, Swin2SRForImageSuperResolution) if is_torch_available() else () fx_compatible = False diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py index 13a39b139c81aa..b0c689cfde3ddd 100644 --- a/tests/models/swinv2/test_modeling_swinv2.py +++ b/tests/models/swinv2/test_modeling_swinv2.py @@ -171,7 +171,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Swinv2ModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( (Swinv2Model, Swinv2ForImageClassification, Swinv2ForMaskedImageModeling) if is_torch_available() else () ) diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 20e089a3133f4e..283fd1710168df 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -73,7 +73,6 @@ def __init__( expert_capacity=100, router_jitter_noise=0.0, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -548,7 +547,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class SwitchTransformersModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( (SwitchTransformersModel, SwitchTransformersForConditionalGeneration) if is_torch_available() else () ) @@ -828,7 +826,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_modeling_flax_t5.py b/tests/models/t5/test_modeling_flax_t5.py index 10e6622bb7df14..a2a80ab25bb662 100644 --- a/tests/models/t5/test_modeling_flax_t5.py +++ b/tests/models/t5/test_modeling_flax_t5.py @@ -46,6 +46,7 @@ from flax.core.frozen_dict import unfreeze from flax.training.common_utils import onehot from flax.traverse_util import flatten_dict + from transformers import FLAX_MODEL_MAPPING, ByT5Tokenizer, T5Config, T5Tokenizer from transformers.modeling_flax_pytorch_utils import load_flax_weights_in_pytorch_model from transformers.models.t5.modeling_flax_t5 import ( @@ -81,7 +82,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -228,7 +228,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxT5ModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxT5Model, FlaxT5ForConditionalGeneration) if 
is_flax_available() else () all_generative_model_classes = (FlaxT5ForConditionalGeneration,) if is_flax_available() else () is_encoder_decoder = True @@ -489,7 +488,6 @@ def __init__( decoder_start_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -576,7 +574,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxT5EncoderOnlyModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxT5EncoderModel,) if is_flax_available() else () is_encoder_decoder = False diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index fe3ce7597bfee5..c6c0ede07121b8 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -73,7 +73,6 @@ def __init__( scope=None, decoder_layers=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length @@ -520,7 +519,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (T5ForConditionalGeneration,) if is_torch_available() else () all_parallelizable_model_classes = (T5Model, T5ForConditionalGeneration) if is_torch_available() else () @@ -703,7 +701,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_modeling_tf_t5.py b/tests/models/t5/test_modeling_tf_t5.py index 57c991f9f15afc..767f9def7b90bd 100644 --- a/tests/models/t5/test_modeling_tf_t5.py +++ b/tests/models/t5/test_modeling_tf_t5.py @@ -240,7 +240,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFT5ModelTest(TFModelTesterMixin, unittest.TestCase): - is_encoder_decoder = True all_model_classes = (TFT5Model, TFT5ForConditionalGeneration) if is_tf_available() else () all_generative_model_classes = (TFT5ForConditionalGeneration,) if is_tf_available() else () @@ -346,7 +345,6 @@ def __init__( pad_token_id=0, scope=None, ): - self.parent = parent self.batch_size = batch_size self.encoder_seq_length = encoder_seq_length diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py index eb429f750ab9cf..8dbef672972ea6 100644 --- a/tests/models/t5/test_tokenization_t5.py +++ b/tests/models/t5/test_tokenization_t5.py @@ -38,7 +38,6 @@ @require_sentencepiece @require_tokenizers class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast test_rust_tokenizer = True @@ -272,7 +271,6 @@ def test_fast_and_slow_same_result(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [f"<extra_id_{i}>" for i in range(100)] + [AddedToken("<special_token>", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( @@ -306,7 +304,6 @@ def test_special_tokens_initialization_with_non_empty_additional_special_tokens( tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer())) for tokenizer_class, tokenizer_utils in tokenizer_list: - with tempfile.TemporaryDirectory() as tmp_dir: tokenizer_utils.save_pretrained(tmp_dir) diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py 
b/tests/models/table_transformer/test_modeling_table_transformer.py index 34bcacefe9ab97..06fbe6f8e2b2b5 100644 --- a/tests/models/table_transformer/test_modeling_table_transformer.py +++ b/tests/models/table_transformer/test_modeling_table_transformer.py @@ -20,6 +20,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import TableTransformerConfig, is_timm_available, is_vision_available from transformers.testing_utils import require_timm, require_vision, slow, torch_device diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 504f3e278ea888..1448c9e33686c3 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -409,7 +409,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class TapasModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TapasModel, diff --git a/tests/models/tapas/test_modeling_tf_tapas.py b/tests/models/tapas/test_modeling_tf_tapas.py index 2f49b57445baaf..ebb4d0d8bf0155 100644 --- a/tests/models/tapas/test_modeling_tf_tapas.py +++ b/tests/models/tapas/test_modeling_tf_tapas.py @@ -419,7 +419,6 @@ def prepare_config_and_inputs_for_common(self): @require_tensorflow_probability @require_tf class TFTapasModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFTapasModel, diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 89865a78e73394..d3af3276f751b9 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -90,7 +90,6 @@ def get_clean_sequence( add_special_tokens: bool = True, return_table_and_query: bool = False, ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] if empty_table: @@ -635,7 +634,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) sequences = tokenizer.encode(table, query, add_special_tokens=False) @@ -1040,7 +1038,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return diff --git a/tests/models/tapex/test_tokenization_tapex.py b/tests/models/tapex/test_tokenization_tapex.py index dec0f507ed3c04..9bc61acb7f0d2e 100644 --- a/tests/models/tapex/test_tokenization_tapex.py +++ b/tests/models/tapex/test_tokenization_tapex.py @@ -84,7 +84,6 @@ def get_clean_sequence( add_special_tokens: bool = True, return_table_and_query: bool = False, ): - toks = [tokenizer.decode([i], clean_up_tokenization_spaces=False) for i in range(len(tokenizer))] if empty_table: @@ -364,7 +363,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - table, query = self.get_table_and_query(tokenizer) sequences = tokenizer.encode(table, query, add_special_tokens=False) diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index a3973a39edd887..1f64d381b1d794 100644 --- 
a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -19,6 +19,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import is_torch_available from transformers.testing_utils import is_flaky, require_torch, slow, torch_device diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py index 8f95b1d0189d86..5a9e039e7fd6a5 100644 --- a/tests/models/timesformer/test_modeling_timesformer.py +++ b/tests/models/timesformer/test_modeling_timesformer.py @@ -20,8 +20,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import TimesformerConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device diff --git a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py index 5bdb52450dbfaa..07c898bf6fda01 100644 --- a/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py +++ b/tests/models/trajectory_transformer/test_modeling_trajectory_transformer.py @@ -95,7 +95,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class TrajectoryTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (TrajectoryTransformerModel,) if is_torch_available() else () # Ignoring of a failing test from GenerationTesterMixin, as the model does not use inputs_ids diff --git a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py index 84e25d8716f5f8..c0051eab2dd855 100644 --- a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py +++ b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py @@ -156,7 +156,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFTransfoXLModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( (TFTransfoXLModel, TFTransfoXLLMHeadModel, TFTransfoXLForSequenceClassification) if is_tf_available() else () ) diff --git a/tests/models/transfo_xl/test_tokenization_transfo_xl.py b/tests/models/transfo_xl/test_tokenization_transfo_xl.py index 3f7065c51b4739..15b712ff3784e3 100644 --- a/tests/models/transfo_xl/test_tokenization_transfo_xl.py +++ b/tests/models/transfo_xl/test_tokenization_transfo_xl.py @@ -23,7 +23,6 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = TransfoXLTokenizer test_rust_tokenizer = False test_seq2seq = False diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 228b0dd175f86a..1f108176959b70 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -546,7 +546,6 @@ def _load_datasamples(self, num_samples): return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): - ds = load_dataset("anton-l/superb_dummy", task, split="test") return ds[:num_samples] diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py index f3982a79df79a9..f2ab1cb1bde2e5 100644 --- a/tests/models/upernet/test_modeling_upernet.py +++ b/tests/models/upernet/test_modeling_upernet.py @@ -19,6 +19,7 @@ import unittest from huggingface_hub import 
hf_hub_download + from transformers import ConvNextConfig, UperNetConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available diff --git a/tests/models/videomae/test_image_processing_videomae.py b/tests/models/videomae/test_image_processing_videomae.py index 53676328450bc5..70980c195cff33 100644 --- a/tests/models/videomae/test_image_processing_videomae.py +++ b/tests/models/videomae/test_image_processing_videomae.py @@ -81,7 +81,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class VideoMAEImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = VideoMAEImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index bc665410b6c6b9..63e6f19ffca297 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -20,8 +20,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import VideoMAEConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device diff --git a/tests/models/vilt/test_image_processing_vilt.py b/tests/models/vilt/test_image_processing_vilt.py index a89fd9b854d1af..f33492d1020578 100644 --- a/tests/models/vilt/test_image_processing_vilt.py +++ b/tests/models/vilt/test_image_processing_vilt.py @@ -117,7 +117,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class ViltImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViltImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py index aaaf62c5a0b464..c4f84012c73a7b 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_flax_vision_encoder_decoder.py @@ -70,7 +70,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -100,7 +100,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -126,7 +126,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model} @@ -162,7 +162,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = 
decoder_input_ids[:, :-1] @@ -236,7 +236,6 @@ def check_encoder_decoder_model_generate(self, pixel_values, config, decoder_con self.assertEqual(generated_sequences.shape, (pixel_values.shape[0],) + (decoder_config.max_length,)) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -278,7 +277,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 1e-5) def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = VisionEncoderDecoderModel(encoder_decoder_config) @@ -290,7 +288,6 @@ def check_equivalence_pt_to_flax(self, config, decoder_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, config, decoder_config, inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) pt_model = VisionEncoderDecoderModel(encoder_decoder_config) @@ -330,7 +327,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() config = config_inputs_dict.pop("config") decoder_config = config_inputs_dict.pop("decoder_config") @@ -442,7 +438,6 @@ def get_from_encoderdecoder_pretrained_model(self): ) def _check_configuration_tie(self, model): - module = model.module.bind(model.params) assert id(module.decoder.config) == id(model.config.decoder) @@ -465,7 +460,6 @@ def prepare_img(): class FlaxViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) @@ -501,7 +495,6 @@ def test_inference_coco_en(self): self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): - outputs = model.generate(pixel_values, max_length=16, num_beams=4) output_ids = outputs.sequences preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True) diff --git a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py index ada036e2aa543e..498f22b17e4f85 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_tf_vision_encoder_decoder.py @@ -84,7 +84,7 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) self.assertTrue(encoder_decoder_config.decoder.is_decoder) @@ -114,7 +114,7 @@ def check_encoder_decoder_model( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -158,7 +158,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_input_ids, decoder_attention_mask, return_dict, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, 
"return_dict": return_dict} @@ -185,7 +185,7 @@ def check_save_and_load( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -223,7 +223,7 @@ def check_encoder_decoder_model_labels( decoder_input_ids, decoder_attention_mask, labels, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) enc_dec_model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model) @@ -253,7 +253,7 @@ def check_encoder_decoder_model_output_attentions( decoder_config, decoder_input_ids, decoder_attention_mask, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -403,7 +403,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -423,7 +422,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -463,7 +461,6 @@ def check_pt_tf_equivalence(self, tf_model, pt_model, tf_inputs_dict): self.check_pt_tf_models(tf_model, pt_model, tf_inputs_dict) def check_pt_to_tf_equivalence(self, config, decoder_config, tf_inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) # Output all for aggressive testing encoder_decoder_config.output_hidden_states = True @@ -479,7 +476,6 @@ def check_pt_to_tf_equivalence(self, config, decoder_config, tf_inputs_dict): self.check_pt_tf_equivalence(tf_model, pt_model, tf_inputs_dict) def check_tf_to_pt_equivalence(self, config, decoder_config, tf_inputs_dict): - encoder_decoder_config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config, decoder_config) # Output all for aggressive testing encoder_decoder_config.output_hidden_states = True @@ -534,7 +530,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): @is_pt_tf_cross_test def test_pt_tf_model_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() labels = config_inputs_dict.pop("decoder_token_labels") @@ -839,7 +834,6 @@ def test_encoder_decoder_from_pretrained(self): decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: - # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. 
@@ -895,7 +889,6 @@ def test_encoder_decoder_from_pretrained(self): class TFViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 6228cb51fd5ac4..52b23195b1db06 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -135,7 +135,7 @@ def check_encoder_decoder_model_from_pretrained( decoder_attention_mask, return_dict, pixel_values=None, - **kwargs + **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) kwargs = {"encoder_model": encoder_model, "decoder_model": decoder_model, "return_dict": return_dict} @@ -226,7 +226,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -402,7 +402,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -590,7 +590,7 @@ def check_encoder_decoder_model_output_attentions( decoder_attention_mask, labels=None, pixel_values=None, - **kwargs + **kwargs, ): # make the decoder inputs a different shape from the encoder inputs to harden the test decoder_input_ids = decoder_input_ids[:, :-1] @@ -747,7 +747,6 @@ def test_inference_printed(self): class ViT2GPT2ModelIntegrationTest(unittest.TestCase): @slow def test_inference_coco_en(self): - loc = "ydshieh/vit-gpt2-coco-en" feature_extractor = ViTFeatureExtractor.from_pretrained(loc) @@ -787,7 +786,6 @@ def test_inference_coco_en(self): self.assertLessEqual(max_diff, 1e-4) def generate_step(pixel_values): - outputs = model.generate( pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True, output_scores=True ) diff --git a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py index cb476c128aa685..1cfa04e67f3994 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_flax_vision_text_dual_encoder.py @@ -100,7 +100,6 @@ def check_model_from_pretrained_configs( def check_vision_text_dual_encoder_from_pretrained( self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) kwargs = {"vision_model": vision_model, "text_model": text_model} model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs) @@ -157,7 +156,6 @@ def check_vision_text_output_attention( ) def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): - pt_model.to(torch_device) pt_model.eval() @@ -199,7 +197,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, inputs_dict): self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 4e-2) def check_equivalence_pt_to_flax(self, vision_config, 
text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -211,7 +208,6 @@ def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, inputs_dict) def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -239,7 +235,6 @@ def test_vision_text_output_attention(self): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() vision_config = config_inputs_dict.pop("vision_config") text_config = config_inputs_dict.pop("text_config") @@ -311,7 +306,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": attention_mask, - "text_config": text_config, "input_ids": input_ids, "token_type_ids": token_type_ids, } @@ -362,7 +356,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": attention_mask, - "text_config": text_config, "input_ids": input_ids, "token_type_ids": token_type_ids, } diff --git a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py index 18182047d66475..c3c2321c24ee71 100644 --- a/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py +++ b/tests/models/vision_text_dual_encoder/test_modeling_vision_text_dual_encoder.py @@ -108,7 +108,6 @@ def check_vision_text_dual_encoder_model( def check_vision_text_dual_encoder_from_pretrained( self, text_config, input_ids, attention_mask, vision_config, pixel_values=None, **kwargs ): - vision_model, text_model = self.get_vision_text_model(vision_config, text_config) kwargs = {"vision_model": vision_model, "text_model": text_model} model = VisionTextDualEncoderModel.from_vision_text_pretrained(**kwargs) @@ -175,7 +174,6 @@ def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): self.assertLessEqual(diff, tol, f"Difference between torch and flax is {diff} (>= {tol}).") def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids, attention_mask, pixel_values, **kwargs): - pt_model.to(torch_device) pt_model.eval() @@ -218,7 +216,6 @@ def check_pt_flax_equivalence(self, pt_model, fx_model, input_ids, attention_mas self.assert_almost_equals(fx_output, pt_output_loaded.numpy(), 4e-2) def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -230,7 +227,6 @@ def check_equivalence_pt_to_flax(self, vision_config, text_config, inputs_dict): self.check_pt_flax_equivalence(pt_model, fx_model, **inputs_dict) def check_equivalence_flax_to_pt(self, vision_config, text_config, inputs_dict): - config = VisionTextDualEncoderConfig.from_vision_text_configs(vision_config, text_config) pt_model = VisionTextDualEncoderModel(config) @@ -262,7 +258,6 @@ def test_vision_text_output_attention(self): @is_pt_flax_cross_test def test_pt_flax_equivalence(self): - config_inputs_dict = self.prepare_config_and_inputs() vision_config = config_inputs_dict.pop("vision_config") text_config = config_inputs_dict.pop("text_config") @@ -341,7 
+336,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, @@ -429,7 +423,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, @@ -491,7 +484,6 @@ def prepare_config_and_inputs(self): "vision_config": vision_config, "pixel_values": pixel_values, "attention_mask": input_mask, - "text_config": text_config, "input_ids": input_ids, "text_token_type_ids": token_type_ids, "text_sequence_labels": sequence_labels, diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index 92ed812fe47d1e..00b0eb3635a54b 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -300,7 +300,6 @@ def create_and_check_for_flickr(self, config, input_dict): @require_torch class VisualBertModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( VisualBertModel, diff --git a/tests/models/vit/test_image_processing_vit.py b/tests/models/vit/test_image_processing_vit.py index ce0cc5610a8368..171ce65e74f17d 100644 --- a/tests/models/vit/test_image_processing_vit.py +++ b/tests/models/vit/test_image_processing_vit.py @@ -74,7 +74,6 @@ def prepare_image_processor_dict(self): @require_torch @require_vision class ViTImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = ViTImageProcessor if is_vision_available() else None def setUp(self): diff --git a/tests/models/vit/test_modeling_flax_vit.py b/tests/models/vit/test_modeling_flax_vit.py index 611f9364885450..ca3130493eda1d 100644 --- a/tests/models/vit/test_modeling_flax_vit.py +++ b/tests/models/vit/test_modeling_flax_vit.py @@ -25,8 +25,8 @@ if is_flax_available(): - import jax + from transformers.models.vit.modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel @@ -125,7 +125,6 @@ def prepare_config_and_inputs_for_common(self): @require_flax class FlaxViTModelTest(FlaxModelTesterMixin, unittest.TestCase): - all_model_classes = (FlaxViTModel, FlaxViTForImageClassification) if is_flax_available() else () def setUp(self) -> None: diff --git a/tests/models/vit/test_modeling_tf_vit.py b/tests/models/vit/test_modeling_tf_vit.py index 7f452886f150a3..95ed814eba655e 100644 --- a/tests/models/vit/test_modeling_tf_vit.py +++ b/tests/models/vit/test_modeling_tf_vit.py @@ -206,7 +206,6 @@ def test_for_image_classification(self): @slow def test_model_from_pretrained(self): - model = TFViTModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model) diff --git a/tests/models/vit_mae/test_modeling_tf_vit_mae.py b/tests/models/vit_mae/test_modeling_tf_vit_mae.py index 3bc582cb1fcd07..8c19c0149112b5 100644 --- a/tests/models/vit_mae/test_modeling_tf_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_tf_vit_mae.py @@ -266,7 +266,6 @@ def prepare_numpy_arrays(inputs_dict): # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - # make masks reproducible np.random.seed(2) @@ -453,7 +452,6 @@ def 
test_model_outputs_equivalence(self): @slow def test_model_from_pretrained(self): - model = TFViTMAEModel.from_pretrained("google/vit-base-patch16-224") self.assertIsNotNone(model) diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 5a48d253a385ee..35693da4cf60b1 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -209,7 +209,6 @@ def test_for_pretraining(self): # overwrite from common since ViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - # make masks reproducible np.random.seed(2) @@ -224,7 +223,6 @@ def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) def test_save_load(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: diff --git a/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py b/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py index 98cf2f1c495bc6..44f2ed5b87362d 100644 --- a/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py +++ b/tests/models/wav2vec2/test_feature_extraction_wav2vec2.py @@ -96,7 +96,6 @@ def _flatten(list_of_lists): class Wav2Vec2FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = Wav2Vec2FeatureExtractor def setUp(self): diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index ac1dd3bcb44ae5..33388eb6d34f97 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -45,6 +45,7 @@ import jax.numpy as jnp import optax from flax.traverse_util import flatten_dict + from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor from transformers.models.wav2vec2.modeling_flax_wav2vec2 import ( FlaxWav2Vec2ForCTC, @@ -58,6 +59,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -67,7 +69,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -276,7 +277,6 @@ def model_jitted(input_values, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) def test_freeze_feature_encoder(self): diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 8f9a8f0bd73bd7..38e83bcdf9e594 100644 --- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -26,8 +26,8 @@ import numpy as np import pytest from datasets import load_dataset - from huggingface_hub import snapshot_download + from transformers import Wav2Vec2Config, is_tf_available from transformers.testing_utils import ( CaptureLogger, @@ -53,6 +53,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -62,7 +63,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = 
in_queue.get(timeout=timeout) @@ -283,7 +283,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFWav2Vec2ModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFWav2Vec2Model, TFWav2Vec2ForCTC) if is_tf_available() else () test_resize_embeddings = False test_head_masking = False diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 9fe18fdf57c8ae..35df9fc223b29b 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -81,6 +81,7 @@ if is_pyctcdecode_available(): import pyctcdecode.decoder + from transformers import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm import processing_wav2vec2_with_lm @@ -90,7 +91,6 @@ def _test_wav2vec2_with_lm_invalid_pool(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py index 829c6096814455..df5db0a3e298a9 100644 --- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py +++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py @@ -24,8 +24,8 @@ import numpy as np from datasets import load_dataset from packaging import version - from parameterized import parameterized + from transformers import AutoProcessor from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES @@ -38,6 +38,7 @@ if is_pyctcdecode_available(): from huggingface_hub import snapshot_download from pyctcdecode import BeamSearchDecoderCTC + from transformers.models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM from transformers.models.wav2vec2_with_lm.processing_wav2vec2_with_lm import Wav2Vec2DecoderWithLMOutput diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index c03763cdf63f77..b490556c5fe6b3 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -113,7 +113,6 @@ def _flatten(list_of_lists): @require_torch @require_torchaudio class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase): - feature_extraction_class = WhisperFeatureExtractor if is_speech_available() else None def setUp(self): diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 1fd89f2525b893..3bc04e56a6d149 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -629,7 +629,6 @@ def test_lm_head_model_random_beam_search_generate(self): def _load_datasamples(num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -638,7 +637,6 @@ def _load_datasamples(num_samples): def _test_large_logits_librispeech(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -687,7 +685,6 @@ def _test_large_logits_librispeech(in_queue, out_queue, timeout): def _test_large_generation(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -715,7 +712,6 @@ def _test_large_generation(in_queue, out_queue, timeout): 
def _test_large_generation_multilingual(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) @@ -761,7 +757,6 @@ def _test_large_generation_multilingual(in_queue, out_queue, timeout): def _test_large_batched_generation(in_queue, out_queue, timeout): - error = None try: _ = in_queue.get(timeout=timeout) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 0944a3a64634dd..54382c2884e633 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -756,7 +756,6 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -886,7 +885,6 @@ def test_large_logits_librispeech(self): @slow def test_tiny_en_generation(self): - torch_device = "cpu" set_seed(0) processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") @@ -910,7 +908,6 @@ def test_tiny_en_generation(self): @slow def test_tiny_generation(self): - torch_device = "cpu" set_seed(0) processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index b4f3252e2fa46d..73550c061825bd 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -21,8 +21,8 @@ import unittest import numpy as np - from huggingface_hub import hf_hub_download + from transformers import XCLIPConfig, XCLIPTextConfig, XCLIPVisionConfig from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device from transformers.utils import is_torch_available, is_vision_available @@ -393,7 +393,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XCLIPTextModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = (XCLIPTextModel,) if is_torch_available() else () fx_compatible = False test_pruning = False @@ -445,7 +444,6 @@ def __init__( mit_hidden_size=64, is_training=True, ): - if text_kwargs is None: text_kwargs = {} if vision_kwargs is None: diff --git a/tests/models/xglm/test_modeling_flax_xglm.py b/tests/models/xglm/test_modeling_flax_xglm.py index 924c73321e9036..60436cb1f9a355 100644 --- a/tests/models/xglm/test_modeling_flax_xglm.py +++ b/tests/models/xglm/test_modeling_flax_xglm.py @@ -26,10 +26,10 @@ if is_flax_available(): - import numpy as np - import jax import jax.numpy as jnp + import numpy as np + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -196,7 +196,6 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input @require_sentencepiece @require_flax class FlaxXGLMModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase): - all_model_classes = (FlaxXGLMModel, FlaxXGLMForCausalLM) if is_flax_available() else () all_generative_model_classes = (FlaxXGLMForCausalLM,) if is_flax_available() else () diff --git a/tests/models/xglm/test_modeling_tf_xglm.py b/tests/models/xglm/test_modeling_tf_xglm.py index b6387901dc955b..6f4069a40c712d 100644 --- a/tests/models/xglm/test_modeling_tf_xglm.py +++ b/tests/models/xglm/test_modeling_tf_xglm.py @@ -140,7 +140,6 @@ def 
prepare_config_and_inputs_for_common(self): @require_tf class TFXGLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = (TFXGLMModel, TFXGLMForCausalLM) if is_tf_available() else () all_generative_model_classes = (TFXGLMForCausalLM,) if is_tf_available() else () test_onnx = False diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py index 662299fb7eb1d9..f906ebf899b72e 100644 --- a/tests/models/xglm/test_modeling_xglm.py +++ b/tests/models/xglm/test_modeling_xglm.py @@ -295,7 +295,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XGLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = (XGLMModel, XGLMForCausalLM) if is_torch_available() else () all_generative_model_classes = (XGLMForCausalLM,) if is_torch_available() else () fx_compatible = True diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py index 05259ffaf9a335..74dd4dab5e3e75 100644 --- a/tests/models/xglm/test_tokenization_xglm.py +++ b/tests/models/xglm/test_tokenization_xglm.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XGLMTokenizer rust_tokenizer_class = XGLMTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlm/test_modeling_tf_xlm.py b/tests/models/xlm/test_modeling_tf_xlm.py index 00e77cee64ba89..d443b7f843b25a 100644 --- a/tests/models/xlm/test_modeling_tf_xlm.py +++ b/tests/models/xlm/test_modeling_tf_xlm.py @@ -277,7 +277,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFXLMModel, diff --git a/tests/models/xlm/test_modeling_xlm.py b/tests/models/xlm/test_modeling_xlm.py index 190e1e958377cd..3e8a5efc894800 100644 --- a/tests/models/xlm/test_modeling_xlm.py +++ b/tests/models/xlm/test_modeling_xlm.py @@ -360,7 +360,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( XLMModel, diff --git a/tests/models/xlm/test_tokenization_xlm.py b/tests/models/xlm/test_tokenization_xlm.py index adb4835eda4070..6e3103521585c8 100644 --- a/tests/models/xlm/test_tokenization_xlm.py +++ b/tests/models/xlm/test_tokenization_xlm.py @@ -25,7 +25,6 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMTokenizer test_rust_tokenizer = False diff --git a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py index 5dec186bc7b9ce..4cd6eb9b5f9d5a 100644 --- a/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_modeling_xlm_prophetnet.py @@ -142,9 +142,8 @@ def test_xprophetnet_ntg_inference(self): tokenizer.convert_ids_to_tokens(g, skip_special_tokens=True) for g in summary_ids_beam1 ] EXPECTED_TITLE_EN_BEAM1_TOK = "▁Microsoft ▁to ▁end ▁free ▁support ▁for ▁Windows ▁7".split(" ") - EXPECTED_TITLE_RU_BEAM1_TOK = ( - "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года" - .split(" ") + EXPECTED_TITLE_RU_BEAM1_TOK = "▁Microsoft ▁намерен а ▁прекрати ть ▁бес плат ную ▁поддержку ▁Windows ▁7 ▁после ▁14 ▁января ▁2020 ▁года".split( + " " ) EXPECTED_TITLE_ZH_BEAM1_TOK = "微软 公司 打算 终止 对 Windows ▁7 操作 系统的 免费 支持".split(" ") self.assertListEqual( 
diff --git a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py index d560007fe3163f..13c02b5415f8fb 100644 --- a/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py +++ b/tests/models/xlm_prophetnet/test_tokenization_xlm_prophetnet.py @@ -27,7 +27,6 @@ @require_sentencepiece class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMProphetNetTokenizer test_rust_tokenizer = False test_sentencepiece = True diff --git a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py index c821cda6f3ce0c..0ceaa739f3fa86 100644 --- a/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_modeling_flax_xlm_roberta.py @@ -22,6 +22,7 @@ if is_flax_available(): import jax.numpy as jnp + from transformers import FlaxXLMRobertaModel diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py index c8f934b258b93c..0dde56481cc7a5 100644 --- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py +++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py @@ -31,7 +31,6 @@ @require_sentencepiece @require_tokenizers class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLMRobertaTokenizer rust_tokenizer_class = XLMRobertaTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 6c9577be777fd4..4efb48cdf0b7e0 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -358,7 +358,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class XLMRobertaXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): - all_model_classes = ( ( XLMRobertaXLForCausalLM, diff --git a/tests/models/xlnet/test_modeling_tf_xlnet.py b/tests/models/xlnet/test_modeling_tf_xlnet.py index bc8f31006bd4ba..a8686d4a2b1f78 100644 --- a/tests/models/xlnet/test_modeling_tf_xlnet.py +++ b/tests/models/xlnet/test_modeling_tf_xlnet.py @@ -332,7 +332,6 @@ def prepare_config_and_inputs_for_common(self): @require_tf class TFXLNetModelTest(TFModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( TFXLNetModel, diff --git a/tests/models/xlnet/test_tokenization_xlnet.py b/tests/models/xlnet/test_tokenization_xlnet.py index 6125a1dffd7791..a9f39202f4a175 100644 --- a/tests/models/xlnet/test_tokenization_xlnet.py +++ b/tests/models/xlnet/test_tokenization_xlnet.py @@ -27,7 +27,6 @@ @require_sentencepiece @require_tokenizers class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = XLNetTokenizer rust_tokenizer_class = XLNetTokenizerFast test_rust_tokenizer = True diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py index b262a654a5a4a6..937cb6fac6e131 100644 --- a/tests/models/yolos/test_image_processing_yolos.py +++ b/tests/models/yolos/test_image_processing_yolos.py @@ -115,7 +115,6 @@ def get_expected_values(self, image_inputs, batched=False): @require_torch @require_vision class YolosImageProcessingTest(ImageProcessingSavingTestMixin, unittest.TestCase): - image_processing_class = YolosImageProcessor if is_vision_available() else None def setUp(self): diff --git 
a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index 0a0749dd7d9bcd..5f72f992eea541 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -281,7 +281,6 @@ def prepare_config_and_inputs_for_common(self): @require_torch class YosoModelTest(ModelTesterMixin, unittest.TestCase): - all_model_classes = ( ( YosoModel, diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index e7a0e15d243d8d..792c2a65cb0c44 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -5,8 +5,8 @@ from unittest.mock import patch import pytest - from parameterized import parameterized + from transformers import AutoConfig, PreTrainedTokenizerBase, is_tf_available, is_torch_available from transformers.onnx import ( EXTERNAL_DATA_FORMAT_SIZE_LIMIT, diff --git a/tests/optimization/test_optimization.py b/tests/optimization/test_optimization.py index c0c5a31a3a49de..551b24a48ec59a 100644 --- a/tests/optimization/test_optimization.py +++ b/tests/optimization/test_optimization.py @@ -120,7 +120,6 @@ def assertListAlmostEqual(self, list1, list2, tol, msg=None): self.assertAlmostEqual(a, b, delta=tol, msg=msg) def test_schedulers(self): - common_kwargs = {"num_warmup_steps": 2, "num_training_steps": 10} # schedulers doct format # function: (sched_args_dict, expected_learning_rates) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 413fc7657f6cfc..2bda09fe00c7fe 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -17,8 +17,8 @@ import numpy as np import pytest from datasets import load_dataset - from huggingface_hub import snapshot_download + from transformers import ( MODEL_FOR_CTC_MAPPING, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, @@ -277,7 +277,6 @@ def test_torch_small_no_tokenizer_files(self): @require_torch @slow def test_torch_large(self): - speech_recognizer = pipeline( task="automatic-speech-recognition", model="facebook/wav2vec2-base-960h", @@ -645,7 +644,6 @@ def test_simple_wav2vec2(self): @require_torch @require_torchaudio def test_simple_s2t(self): - model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-mustc-en-it-st") tokenizer = AutoTokenizer.from_pretrained("facebook/s2t-small-mustc-en-it-st") feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-mustc-en-it-st") diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 2213ec6ca89a71..8e38c6dcd6186b 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -26,10 +26,10 @@ import datasets import numpy as np - import requests from huggingface_hub import HfFolder, Repository, create_repo, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, @@ -119,7 +119,6 @@ def is_test_to_skip(test_casse_name, config_class, model_architecture, tokenizer # TODO: check and fix if possible if not to_skip and tokenizer_name is not None: - if ( test_casse_name == "QAPipelineTests" and not tokenizer_name.endswith("Fast") @@ -196,7 +195,6 @@ def is_test_to_skip(test_casse_name, config_class, model_architecture, tokenizer def validate_test_components(test_case, model, tokenizer, processor): - # TODO: Move this to tiny model creation script # 
head-specific (within a model type) necessary changes to the config # 1. for `BlenderbotForCausalLM` @@ -296,7 +294,6 @@ def data(n): mapping = dct.get(key, {}) if mapping: for config_class, model_architectures in mapping.items(): - if not isinstance(model_architectures, tuple): model_architectures = (model_architectures,) diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index d79f2f2abafc40..8e13407246558c 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -44,7 +44,6 @@ def hashimage(image: Image) -> str: @require_timm @require_torch class DepthEstimationPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING def get_test_pipeline(self, model, tokenizer, processor): diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 3ac4b70737ca0b..a1a0c7b3714504 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -18,9 +18,9 @@ import datasets import numpy as np +import requests from datasets import load_dataset -import requests from transformers import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 601606a4cd6cb0..907419618339a5 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -15,6 +15,7 @@ import unittest from huggingface_hub import hf_hub_download + from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor from transformers.pipelines import VideoClassificationPipeline, pipeline from transformers.testing_utils import ( @@ -47,7 +48,6 @@ def get_test_pipeline(self, model, tokenizer, processor): return video_classifier, examples def run_pipeline_test(self, video_classifier, examples): - for example in examples: outputs = video_classifier(example) diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index fea8bf1c48e995..b84ba20a4c9c45 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -33,7 +33,6 @@ def open(*args, **kwargs): @require_vision @require_torch class ZeroShotObjectDetectionPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): - model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING def get_test_pipeline(self, model, tokenizer, processor): diff --git a/tests/repo_utils/test_check_dummies.py b/tests/repo_utils/test_check_dummies.py index 8dde0f49443b9c..2411990a16d847 100644 --- a/tests/repo_utils/test_check_dummies.py +++ b/tests/repo_utils/test_check_dummies.py @@ -20,7 +20,7 @@ git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) sys.path.append(os.path.join(git_repo_path, "utils")) -import check_dummies +import check_dummies # noqa: E402 from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init # noqa: E402 diff --git a/tests/sagemaker/conftest.py b/tests/sagemaker/conftest.py index 8e7c0bbf1d0cda..0ed261864bcd27 100644 --- a/tests/sagemaker/conftest.py +++ b/tests/sagemaker/conftest.py @@ -4,7 +4,6 @@ 
import os import pytest - from attr import dataclass diff --git a/tests/sagemaker/scripts/tensorflow/run_tf.py b/tests/sagemaker/scripts/tensorflow/run_tf.py index a47e76c09d6125..03f631d2667995 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf.py @@ -10,7 +10,6 @@ if __name__ == "__main__": - parser = argparse.ArgumentParser() # Hyperparameters sent by the client are passed as command-line arguments to the script. diff --git a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py index 84f4275aafce3a..f8f2e4bcf29d49 100644 --- a/tests/sagemaker/scripts/tensorflow/run_tf_dist.py +++ b/tests/sagemaker/scripts/tensorflow/run_tf_dist.py @@ -85,7 +85,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): if __name__ == "__main__": - parser = argparse.ArgumentParser() # Hyperparameters sent by the client are passed as command-line arguments to the script. @@ -143,7 +142,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): # Training if args.do_train: - # train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size) start_train_time = time.time() train_results = fit( @@ -171,7 +169,6 @@ def get_datasets(tokenizer, train_batch_size, eval_batch_size): # Evaluation if args.do_eval and (not SDP_ENABLED or sdp.rank() == 0): - result = model.evaluate(tf_test_dataset, batch_size=args.per_device_eval_batch_size, return_dict=True) logger.info("*** Evaluate ***") diff --git a/tests/sagemaker/test_multi_node_data_parallel.py b/tests/sagemaker/test_multi_node_data_parallel.py index 8fb60d64a61f8c..cc7f9e5e84f8bf 100644 --- a/tests/sagemaker/test_multi_node_data_parallel.py +++ b/tests/sagemaker/test_multi_node_data_parallel.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized, parameterized_class from . import is_sagemaker_available diff --git a/tests/sagemaker/test_multi_node_model_parallel.py b/tests/sagemaker/test_multi_node_model_parallel.py index 38a1c9a6b3b7bd..95d5b9fa855904 100644 --- a/tests/sagemaker/test_multi_node_model_parallel.py +++ b/tests/sagemaker/test_multi_node_model_parallel.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized, parameterized_class from . import is_sagemaker_available @@ -50,7 +49,6 @@ def setUp(self): assert hasattr(self, "env") def create_estimator(self, instance_count): - # configuration for running training on smdistributed Model Parallel mpi_options = { "enabled": True, diff --git a/tests/sagemaker/test_single_node_gpu.py b/tests/sagemaker/test_single_node_gpu.py index e71f82d31634e0..f2a62547e787c6 100644 --- a/tests/sagemaker/test_single_node_gpu.py +++ b/tests/sagemaker/test_single_node_gpu.py @@ -5,7 +5,6 @@ from ast import literal_eval import pytest - from parameterized import parameterized_class from . 
import is_sagemaker_available diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index df05d2a4ac84fd..dc9927f1938527 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -25,6 +25,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoConfig, BertConfig, GPT2Config, is_torch_available from transformers.configuration_utils import PretrainedConfig from transformers.testing_utils import TOKEN, USER, is_staging_test diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index 5c60cf58ac2533..ee8dfefb8406a0 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -24,6 +24,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoFeatureExtractor, Wav2Vec2FeatureExtractor from transformers.testing_utils import TOKEN, USER, check_json_file_has_correct_format, get_tests_dir, is_staging_test diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index d8485e3853d840..e18f8bf60f6fc1 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -24,6 +24,7 @@ from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + from transformers import AutoImageProcessor, ViTImageProcessor from transformers.testing_utils import ( TOKEN, diff --git a/tests/test_image_transforms.py b/tests/test_image_transforms.py index 206c8dc5b8fbf1..2287fdbf2ce99d 100644 --- a/tests/test_image_transforms.py +++ b/tests/test_image_transforms.py @@ -16,8 +16,8 @@ import unittest import numpy as np - from parameterized import parameterized + from transformers.testing_utils import require_flax, require_tf, require_torch, require_vision from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torch_available, is_vision_available diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4b3051a13fd4f3..b78e7775515e9f 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -30,11 +30,11 @@ from typing import Dict, List, Tuple import numpy as np - -import transformers from huggingface_hub import HfFolder, delete_repo, set_access_token from huggingface_hub.file_download import http_get from requests.exceptions import HTTPError + +import transformers from transformers import ( AutoConfig, AutoModel, @@ -108,9 +108,9 @@ if is_torch_available(): import torch + from test_module.custom_modeling import CustomModel, NoSuperInitModel from torch import nn - from test_module.custom_modeling import CustomModel, NoSuperInitModel from transformers import ( BERT_PRETRAINED_MODEL_ARCHIVE_LIST, MODEL_MAPPING, @@ -160,6 +160,7 @@ def forward(self, x): if is_flax_available(): import jax.numpy as jnp + from transformers.modeling_flax_pytorch_utils import ( convert_pytorch_state_dict_to_flax, load_flax_weights_in_pytorch_model, @@ -183,7 +184,6 @@ def _config_zero_init(config): @require_torch class ModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () @@ -417,7 +417,6 @@ def test_save_load_fast_init_to_base(self): base_class = base_class[0] for model_class in self.all_model_classes: - if model_class == base_class: continue @@ -706,7 +705,6 @@ def 
test_torchscript_output_hidden_state(self): # This is copied from `torch/testing/_internal/jit_utils.py::clear_class_registry` def clear_torch_jit_class_registry(self): - torch._C._jit_clear_class_registry() torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() # torch 1.8 has no `_clear_class_state` in `torch.jit._state` @@ -1512,7 +1510,6 @@ def test_correct_missing_keys(self): base_model_prefix = model.base_model_prefix if hasattr(model, base_model_prefix): - extra_params = {k: v for k, v in model.named_parameters() if not k.startswith(base_model_prefix)} extra_params.update({k: v for k, v in model.named_buffers() if not k.startswith(base_model_prefix)}) # Some models define this as None @@ -1854,7 +1851,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): - tf_inputs_dict = {} for key, tensor in pt_inputs_dict.items(): # skip key that does not exist in tf @@ -1875,7 +1871,6 @@ def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): return tf_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - tf_inputs_dict = self.prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) # send pytorch inputs to the correct device @@ -1907,7 +1902,6 @@ def test_pt_tf_model_equivalence(self): import transformers for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning @@ -2544,7 +2538,6 @@ def test_problem_types(self): for problem_type in problem_types: with self.subTest(msg=f"Testing {model_class} with {problem_type['title']}"): - config.problem_type = problem_type["title"] config.num_labels = problem_type["num_labels"] diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py index fe1cb694350daf..c9fe7cc4719b6c 100644 --- a/tests/test_modeling_flax_common.py +++ b/tests/test_modeling_flax_common.py @@ -21,10 +21,10 @@ from typing import List, Tuple import numpy as np - -import transformers from huggingface_hub import HfFolder, delete_repo, set_access_token from requests.exceptions import HTTPError + +import transformers from transformers import BertConfig, is_flax_available, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import ( @@ -48,6 +48,7 @@ from flax.core.frozen_dict import FrozenDict, freeze, unfreeze from flax.serialization import from_bytes from flax.traverse_util import flatten_dict, unflatten_dict + from transformers import ( FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, @@ -275,7 +276,6 @@ def test_equivalence_pt_to_flax(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # Output all for aggressive testing config.output_hidden_states = True config.output_attentions = self.has_attentions @@ -328,7 +328,6 @@ def test_equivalence_flax_to_pt(self): for model_class in self.all_model_classes: with self.subTest(model_class.__name__): - # Output all for aggressive testing config.output_hidden_states = True config.output_attentions = self.has_attentions @@ -569,7 +568,6 @@ def model_jitted(input_ids, attention_mask=None, **kwargs): self.assertEqual(len(outputs), len(jitted_outputs)) for jitted_output, output in zip(jitted_outputs, outputs): - self.assertEqual(jitted_output.shape, output.shape) def test_forward_signature(self): diff --git 
a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index eaf8f82e78e88a..ced3c0f86afe6d 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -28,10 +28,10 @@ from typing import List, Tuple, get_type_hints from datasets import Dataset - from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from huggingface_hub.file_download import http_get from requests.exceptions import HTTPError + from transformers import is_tf_available, is_torch_available from transformers.configuration_utils import PretrainedConfig from transformers.models.auto import get_values @@ -157,7 +157,6 @@ def _return_type_has_loss(model): @require_tf class TFModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () @@ -618,7 +617,6 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, nam ) def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): - pt_inputs_dict = {} for name, key in tf_inputs_dict.items(): if type(key) == bool: @@ -638,7 +636,6 @@ def prepare_pt_inputs_from_tf_inputs(self, tf_inputs_dict): return pt_inputs_dict def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict): - pt_inputs_dict = self.prepare_pt_inputs_from_tf_inputs(tf_inputs_dict) # send pytorch inputs to the correct device @@ -670,7 +667,6 @@ def test_pt_tf_model_equivalence(self): import transformers for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # Output all for aggressive testing @@ -1069,7 +1065,6 @@ def test_determinism(self): self.assertLessEqual(max_diff, 1e-5) def test_model_outputs_equivalence(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): diff --git a/tests/test_sequence_feature_extraction_common.py b/tests/test_sequence_feature_extraction_common.py index 0114f4016da695..710ad01250f9e9 100644 --- a/tests/test_sequence_feature_extraction_common.py +++ b/tests/test_sequence_feature_extraction_common.py @@ -23,7 +23,6 @@ class SequenceFeatureExtractionTestMixin(FeatureExtractionSavingTestMixin): - # to overwrite at feature extractactor specific tests feat_extract_tester = None feature_extraction_class = None diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 8ca460449e249c..83b857cda9ac03 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -34,6 +34,7 @@ from huggingface_hub.file_download import http_get from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import ( AlbertTokenizer, AlbertTokenizerFast, @@ -131,7 +132,6 @@ def merge_model_tokenizer_mappings( class TokenizerTesterMixin: - tokenizer_class = None rust_tokenizer_class = None test_slow_tokenizer = True @@ -915,7 +915,6 @@ def test_encode_decode_with_spaces(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - new_toks = [ AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False), @@ -953,7 +952,6 @@ def test_mask_output(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if ( tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer" and "token_type_ids" in 
tokenizer.model_input_names @@ -1004,7 +1002,6 @@ def test_number_of_added_tokens(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - seq_0 = "Test this method." seq_1 = "With these inputs." @@ -2140,7 +2137,6 @@ def test_pretokenized_inputs(self): tokenizers = self.get_tokenizers(do_lower_case=False) # , add_prefix_space=True) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space: continue @@ -2373,7 +2369,6 @@ def test_torch_encode_plus_sent_to_model(self): tokenizers = self.get_tokenizers(do_lower_case=False) for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): - if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING: return @@ -2956,7 +2951,6 @@ def test_batch_encode_dynamic_overflowing(self): tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name}, {tokenizer.__class__.__name__})"): - if is_torch_available(): returned_tensor = "pt" elif is_tf_available(): @@ -3579,7 +3573,6 @@ def test_compare_prepare_for_model(self): def test_special_tokens_initialization(self): for tokenizer, pretrained_name, kwargs in self.tokenizers_list: with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): - added_tokens = [AddedToken("", lstrip=True)] tokenizer_r = self.rust_tokenizer_class.from_pretrained( diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py index a655b84dc16c68..186fabb7aea086 100644 --- a/tests/tokenization/test_tokenization_utils.py +++ b/tests/tokenization/test_tokenization_utils.py @@ -23,14 +23,12 @@ import numpy as np -# Ensure there are no circular imports when importing the parent class -from transformers import PreTrainedTokenizerFast - from transformers import ( BatchEncoding, BertTokenizer, BertTokenizerFast, PreTrainedTokenizer, + PreTrainedTokenizerFast, TensorType, TokenSpan, is_tokenizers_available, diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 47f3ebbc442ecd..a09865448d789d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -29,10 +29,10 @@ from unittest.mock import Mock, patch import numpy as np - from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from parameterized import parameterized from requests.exceptions import HTTPError + from transformers import ( AutoTokenizer, IntervalStrategy, @@ -565,7 +565,6 @@ def test_adafactor_lr_none(self): @require_torch_gpu @require_torch_bf16_gpu def test_mixed_bf16(self): - # very basic test trainer = get_regression_trainer(learning_rate=0.1, bf16=True) trainer.train() @@ -580,7 +579,6 @@ def test_mixed_bf16(self): @require_torch_gpu @require_torch_tf32 def test_tf32(self): - # very basic test trainer = get_regression_trainer(learning_rate=0.1, tf32=True) trainer.train() @@ -1289,7 +1287,6 @@ def test_resume_training_with_randomness(self): @require_accelerate @require_torch_non_multi_gpu def test_auto_batch_size_finder(self): - if torch.cuda.is_available(): torch.backends.cudnn.deterministic = True @@ -1736,7 +1733,6 @@ def check_mem_metrics(self, trainer, check_func): check_func("test_mem_gpu_alloc_delta", metrics) def test_mem_metrics(self): - # with mem metrics enabled trainer = get_regression_trainer(skip_memory_metrics=False) 
self.check_mem_metrics(trainer, self.assertIn) @@ -1747,7 +1743,6 @@ def test_mem_metrics(self): @require_torch_gpu def test_fp16_full_eval(self): - # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. # it's using pretty large safety margins, but small enough to detect broken functionality. debug = 0 @@ -2467,7 +2462,6 @@ class MyTrialShortNamer(TrialShortNamer): DEFAULTS = {"a": 0, "b": 0} def hp_space(trial): - return { "method": "random", "metric": {}, diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 68d07e0f60f331..5fa6edb1c88f1b 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -66,7 +66,6 @@ def forward(self, input_ids, labels=None): class TestTrainerDistributedNeuronCore(TestCasePlus): @require_torch_neuroncore def test_trainer(self): - distributed_args = f""" -m torch.distributed.launch --nproc_per_node=2 @@ -83,7 +82,6 @@ def test_trainer(self): class TestTrainerDistributed(TestCasePlus): @require_torch_multi_gpu def test_trainer(self): - distributed_args = f""" -m torch.distributed.launch --nproc_per_node={torch.cuda.device_count()} diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index da824f4743823d..0ad3c9c2ac4672 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -24,6 +24,7 @@ from typing import List, Optional import yaml + from transformers import HfArgumentParser, TrainingArguments from transformers.hf_argparser import make_choice_type_function, string_to_bool diff --git a/tests/utils/test_hub_utils.py b/tests/utils/test_hub_utils.py index c8c7d0faad70c1..272ea26fa32e1a 100644 --- a/tests/utils/test_hub_utils.py +++ b/tests/utils/test_hub_utils.py @@ -19,6 +19,7 @@ from pathlib import Path from requests.exceptions import HTTPError + from transformers.utils import ( CONFIG_NAME, FLAX_WEIGHTS_NAME, diff --git a/tests/utils/test_logging.py b/tests/utils/test_logging.py index 81f3d9144ad78f..5f0b5fe32533e5 100644 --- a/tests/utils/test_logging.py +++ b/tests/utils/test_logging.py @@ -15,8 +15,9 @@ import os import unittest -import transformers.models.bart.tokenization_bart from huggingface_hub.utils import are_progress_bars_disabled + +import transformers.models.bart.tokenization_bart from transformers import logging from transformers.testing_utils import CaptureLogger, mockenv, mockenv_context from transformers.utils.logging import disable_progress_bar, enable_progress_bar diff --git a/tests/utils/test_modeling_tf_core.py b/tests/utils/test_modeling_tf_core.py index 6a68718aac94f3..77958335072353 100644 --- a/tests/utils/test_modeling_tf_core.py +++ b/tests/utils/test_modeling_tf_core.py @@ -62,7 +62,6 @@ @require_tf class TFCoreModelTesterMixin: - model_tester = None all_model_classes = () all_generative_model_classes = () diff --git a/tests/utils/test_offline.py b/tests/utils/test_offline.py index 708accc7a63885..1fda8804a88a88 100644 --- a/tests/utils/test_offline.py +++ b/tests/utils/test_offline.py @@ -22,7 +22,6 @@ class OfflineTests(TestCasePlus): @require_torch def test_offline_mode(self): - # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before # `transformers` is loaded, and it's too late for inside pytest - so we are changing it # while running an external program @@ -108,7 +107,6 @@ def offline_socket(*args, **kwargs): raise socket.error("Faking flaky internet") @require_torch def 
test_offline_mode_sharded_checkpoint(self): - # this test is a bit tricky since TRANSFORMERS_OFFLINE can only be changed before # `transformers` is loaded, and it's too late for inside pytest - so we are changing it # while running an external program diff --git a/tests/utils/test_skip_decorators.py b/tests/utils/test_skip_decorators.py index 89ff0e3bafdc2b..6888fea23cffd4 100644 --- a/tests/utils/test_skip_decorators.py +++ b/tests/utils/test_skip_decorators.py @@ -32,8 +32,8 @@ import unittest import pytest - from parameterized import parameterized + from transformers.testing_utils import require_torch, require_torch_gpu, slow, torch_device diff --git a/tests/utils/test_versions_utils.py b/tests/utils/test_versions_utils.py index 6bd77218d69feb..e691a2bcc383e1 100644 --- a/tests/utils/test_versions_utils.py +++ b/tests/utils/test_versions_utils.py @@ -84,7 +84,6 @@ def test_core(self): self.assertIn("need one of ", str(e)) def test_python(self): - # matching requirement require_version("python>=3.6.0") diff --git a/utils/check_copies.py b/utils/check_copies.py index 48c1096f2b75d1..b10da732d32cf3 100644 --- a/utils/check_copies.py +++ b/utils/check_copies.py @@ -177,7 +177,7 @@ def blackify(code): has_indent = len(get_indent(code)) > 0 if has_indent: code = f"class Bla:\n{code}" - mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119, preview=True) + mode = black.Mode(target_versions={black.TargetVersion.PY37}, line_length=119) result = black.format_str(code, mode=mode) result, _ = style_docstrings_in_code(result) return result[len("class Bla:\n") :] if has_indent else result diff --git a/utils/check_doctest_list.py b/utils/check_doctest_list.py index 330832e04cb91a..c81c3d8e212ed4 100644 --- a/utils/check_doctest_list.py +++ b/utils/check_doctest_list.py @@ -22,7 +22,6 @@ if __name__ == "__main__": - doctest_file_path = os.path.join(REPO_PATH, "utils/documentation_tests.txt") non_existent_paths = [] with open(doctest_file_path) as fp: diff --git a/utils/check_repo.py b/utils/check_repo.py index 37fd498b64e5f5..cf921a7024a47f 100755 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -445,7 +445,7 @@ def get_model_test_files(): path = os.path.join(target_dir, file_or_dir) if os.path.isfile(path): filename = os.path.split(path)[-1] - if "test_modeling" in filename and not os.path.splitext(filename)[0] in _ignore_files: + if "test_modeling" in filename and os.path.splitext(filename)[0] not in _ignore_files: file = os.path.join(*path.split(os.sep)[1:]) test_files.append(file) diff --git a/utils/check_self_hosted_runner.py b/utils/check_self_hosted_runner.py index f7303366ea7840..0eadfcf5f9c0e1 100644 --- a/utils/check_self_hosted_runner.py +++ b/utils/check_self_hosted_runner.py @@ -4,7 +4,6 @@ def get_runner_status(target_runners, token): - offline_runners = [] cmd = ( diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py index e2844f6aee12f3..47c150d6e84c68 100644 --- a/utils/create_dummy_models.py +++ b/utils/create_dummy_models.py @@ -25,10 +25,10 @@ import tempfile from pathlib import Path -from datasets import load_dataset - from check_config_docstrings import get_checkpoint_from_config_class +from datasets import load_dataset from huggingface_hub import Repository, create_repo, upload_folder + from transformers import ( CONFIG_MAPPING, FEATURE_EXTRACTOR_MAPPING, @@ -350,7 +350,6 @@ def get_tiny_config(config_class, **model_tester_kwargs): def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast): - new_tokenizer = 
tokenizer_fast.train_new_from_iterator(training_ds["text"], TARGET_VOCAB_SIZE, show_progress=False) # Make sure it at least runs @@ -361,7 +360,6 @@ def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast): def convert_feature_extractor(feature_extractor, tiny_config): - to_convert = False kwargs = {} if hasattr(tiny_config, "image_size"): @@ -574,7 +572,6 @@ def upload_model(model_dir, organization): raise ValueError(error) with tempfile.TemporaryDirectory() as tmpdir: - repo = Repository(local_dir=tmpdir, clone_from=f"{organization}/{repo_name}") repo.git_pull() shutil.copytree(model_dir, tmpdir, dirs_exist_ok=True) @@ -599,7 +596,6 @@ def upload_model(model_dir, organization): def build_composite_models(config_class, output_dir): - import tempfile from transformers import ( @@ -668,7 +664,6 @@ def build_composite_models(config_class, output_dir): tf_model_class = None with tempfile.TemporaryDirectory() as tmpdir: - try: # build encoder models_to_create = {"processor": encoder_processor, "pytorch": (encoder_class,), "tensorflow": []} @@ -761,7 +756,6 @@ def get_token_id_from_tokenizer(token_id_name, tokenizer, original_token_id): def get_config_overrides(config_class, processors): - config_overrides = {} # Check if there is any tokenizer (prefer fast version if any) @@ -990,7 +984,6 @@ def build(config_class, models_to_create, output_dir): def build_failed_report(results, include_warning=True): - failed_results = {} for config_name in results: if "error" in results[config_name]: @@ -1021,7 +1014,6 @@ def build_failed_report(results, include_warning=True): def build_simple_report(results): - text = "" failed_text = "" for config_name in results: @@ -1040,7 +1032,6 @@ def build_simple_report(results): if __name__ == "__main__": - clone_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) if os.getcwd() != clone_path: raise ValueError(f"This script should be run from the root of the clone of `transformers` {clone_path}") diff --git a/utils/custom_init_isort.py b/utils/custom_init_isort.py index c17ce139569f3e..d250ce7e64815a 100644 --- a/utils/custom_init_isort.py +++ b/utils/custom_init_isort.py @@ -96,6 +96,7 @@ def _inner(x): def sort_objects(objects, key=None): "Sort a list of `objects` following the rules of isort. `key` optionally maps an object to a str." + # If no key is provided, we use a noop. def noop(x): return x @@ -117,6 +118,7 @@ def sort_objects_in_import(import_statement): """ Return the same `import_statement` but with objects properly sorted. """ + # This inner function sort imports between [ ]. 
def _replace(match): imports = match.groups()[0] diff --git a/utils/extract_warnings.py b/utils/extract_warnings.py index 48912ea6f58f72..cb609e861503b8 100644 --- a/utils/extract_warnings.py +++ b/utils/extract_warnings.py @@ -5,6 +5,7 @@ import zipfile from get_ci_error_statistics import download_artifact, get_artifacts_links + from transformers import logging diff --git a/utils/get_ci_error_statistics.py b/utils/get_ci_error_statistics.py index 790ec5e3d565c3..b6642dce9c0d6f 100644 --- a/utils/get_ci_error_statistics.py +++ b/utils/get_ci_error_statistics.py @@ -209,7 +209,6 @@ def make_github_table_per_model(reduced_by_model): if __name__ == "__main__": - parser = argparse.ArgumentParser() # Required parameters parser.add_argument( diff --git a/utils/notification_service.py b/utils/notification_service.py index da315dc56aef3d..9967db447e772f 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -413,7 +413,6 @@ def payload(self) -> str: @staticmethod def error_out(title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=False): - blocks = [] title_block = {"type": "header", "text": {"type": "plain_text", "text": title}} blocks.append(title_block) @@ -691,7 +690,6 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": - runner_status = os.environ.get("RUNNER_STATUS") runner_env_status = os.environ.get("RUNNER_ENV_STATUS") setup_status = os.environ.get("SETUP_STATUS") @@ -832,7 +830,6 @@ def prepare_reports(title, header, reports, to_truncate=True): for line in artifact["summary_short"].split("\n"): if re.search("FAILED", line): - line = line.replace("FAILED ", "") line = line.split()[0].replace("\n", "") @@ -897,7 +894,6 @@ def prepare_reports(title, header, reports, to_truncate=True): } for key in additional_results.keys(): - # If a whole suite of test fails, the artifact isn't available. if additional_files[key] not in available_artifacts: additional_results[key]["error"] = True diff --git a/utils/notification_service_doc_tests.py b/utils/notification_service_doc_tests.py index 7d5605c1cae3b0..3aabeaec0f3120 100644 --- a/utils/notification_service_doc_tests.py +++ b/utils/notification_service_doc_tests.py @@ -323,7 +323,6 @@ def add_path(self, path: str): if __name__ == "__main__": - github_actions_job_links = get_job_links() available_artifacts = retrieve_available_artifacts() @@ -359,7 +358,6 @@ def add_path(self, path: str): all_failures = extract_first_line_failure(artifact["failures_short"]) for line in artifact["summary_short"].split("\n"): if re.search("FAILED", line): - line = line.replace("FAILED ", "") line = line.split()[0].replace("\n", "") diff --git a/utils/update_metadata.py b/utils/update_metadata.py index d6dea03a8ba6e3..e52d93fe62d925 100644 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -22,7 +22,6 @@ import pandas as pd from datasets import Dataset - from huggingface_hub import Repository